fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits, losing 3400+ files across admin-v2, backend, studio-v2, website, klausur-service, and many other services. The partial restore attempt (660295e2) only recovered some files. This commit restores all missing files from pre-rebase ref 98933f5e while preserving post-rebase additions (night-scheduler, night-mode UI, NightModeWidget dashboard integration). Restored features include: - AI Module Sidebar (FAB), OCR Labeling, OCR Compare - GPU Dashboard, RAG Pipeline, Magic Help - Klausur-Korrektur (8 files), Abitur-Archiv (5+ files) - Companion, Zeugnisse-Crawler, Screen Flow - Full backend, studio-v2, website, klausur-service - All compliance SDKs, agent-core, voice-service - CI/CD configs, documentation, scripts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions
@@ -0,0 +1,36 @@
+"""
+Orchestration Layer for Breakpilot Agents
+
+Provides:
+- MessageBus: Inter-agent communication via Redis Pub/Sub
+- AgentSupervisor: Agent lifecycle and health management
+- TaskRouter: Intent-based task routing
+"""
+
+from agent_core.orchestrator.message_bus import (
+    MessageBus,
+    AgentMessage,
+    MessagePriority,
+)
+from agent_core.orchestrator.supervisor import (
+    AgentSupervisor,
+    AgentInfo,
+    AgentStatus,
+)
+from agent_core.orchestrator.task_router import (
+    TaskRouter,
+    RoutingResult,
+    RoutingStrategy,
+)
+
+__all__ = [
+    "MessageBus",
+    "AgentMessage",
+    "MessagePriority",
+    "AgentSupervisor",
+    "AgentInfo",
+    "AgentStatus",
+    "TaskRouter",
+    "RoutingResult",
+    "RoutingStrategy",
+]
@@ -0,0 +1,479 @@
+"""
+Message Bus for Inter-Agent Communication
+
+Provides:
+- Pub/Sub messaging via Redis/Valkey
+- Request-Response pattern with timeouts
+- Priority-based message handling
+- Message persistence for audit
+"""
+
+from typing import Callable, Dict, Any, List, Optional, Awaitable
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+import asyncio
+import uuid
+import json
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class MessagePriority(Enum):
+    """Message priority levels"""
+    LOW = 0
+    NORMAL = 1
+    HIGH = 2
+    CRITICAL = 3
+
+
+class MessageType(Enum):
+    """Standard message types"""
+    REQUEST = "request"
+    RESPONSE = "response"
+    EVENT = "event"
+    BROADCAST = "broadcast"
+    HEARTBEAT = "heartbeat"
+
+
+@dataclass
+class AgentMessage:
+    """Represents a message between agents"""
+    sender: str
+    receiver: str
+    message_type: str
+    payload: Dict[str, Any]
+    priority: MessagePriority = MessagePriority.NORMAL
+    correlation_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+    reply_to: Optional[str] = None
+    expires_at: Optional[datetime] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "sender": self.sender,
+            "receiver": self.receiver,
+            "message_type": self.message_type,
+            "payload": self.payload,
+            "priority": self.priority.value,
+            "correlation_id": self.correlation_id,
+            "timestamp": self.timestamp.isoformat(),
+            "reply_to": self.reply_to,
+            "expires_at": self.expires_at.isoformat() if self.expires_at else None,
+            "metadata": self.metadata
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "AgentMessage":
+        return cls(
+            sender=data["sender"],
+            receiver=data["receiver"],
+            message_type=data["message_type"],
+            payload=data["payload"],
+            priority=MessagePriority(data.get("priority", 1)),
+            correlation_id=data.get("correlation_id", str(uuid.uuid4())),
+            timestamp=datetime.fromisoformat(data["timestamp"]) if "timestamp" in data else datetime.now(timezone.utc),
+            reply_to=data.get("reply_to"),
+            expires_at=datetime.fromisoformat(data["expires_at"]) if data.get("expires_at") else None,
+            metadata=data.get("metadata", {})
+        )
+
+
+MessageHandler = Callable[[AgentMessage], Awaitable[Optional[Dict[str, Any]]]]
+
+
+class MessageBus:
+    """
+    Inter-agent communication via Redis Pub/Sub.
+
+    Features:
+    - Publish/Subscribe pattern
+    - Request/Response with timeout
+    - Priority queues
+    - Message persistence for audit
+    """
+
+    def __init__(
+        self,
+        redis_client=None,
+        db_pool=None,
+        namespace: str = "breakpilot",
+        persist_messages: bool = True
+    ):
+        """
+        Initialize the message bus.
+
+        Args:
+            redis_client: Async Redis/Valkey client
+            db_pool: Async PostgreSQL pool for persistence
+            namespace: Channel namespace
+            persist_messages: Whether to persist messages for audit
+        """
+        self.redis = redis_client
+        self.db_pool = db_pool
+        self.namespace = namespace
+        self.persist_messages = persist_messages
+
+        self._handlers: Dict[str, List[MessageHandler]] = {}
+        self._pending_responses: Dict[str, asyncio.Future] = {}
+        self._subscriptions: Dict[str, asyncio.Task] = {}
+        self._running = False
+
+    def _channel(self, agent_id: str) -> str:
+        """Generate channel name for agent"""
+        return f"{self.namespace}:agent:{agent_id}"
+
+    def _broadcast_channel(self) -> str:
+        """Generate broadcast channel name"""
+        return f"{self.namespace}:broadcast"
+
+    async def start(self) -> None:
+        """Starts the message bus"""
+        self._running = True
+        logger.info("Message bus started")
+
+    async def stop(self) -> None:
+        """Stops the message bus and all subscriptions"""
+        self._running = False
+
+        # Cancel all subscription tasks
+        for task in self._subscriptions.values():
+            task.cancel()
+
+        # Wait for cancellation
+        if self._subscriptions:
+            await asyncio.gather(
+                *self._subscriptions.values(),
+                return_exceptions=True
+            )
+
+        self._subscriptions.clear()
+        logger.info("Message bus stopped")
+
+    async def subscribe(
+        self,
+        agent_id: str,
+        handler: MessageHandler
+    ) -> None:
+        """
+        Subscribe an agent to receive messages.
+
+        Args:
+            agent_id: The agent ID to subscribe
+            handler: Async function to handle incoming messages
+        """
+        if agent_id in self._subscriptions:
+            logger.warning(f"Agent {agent_id} already subscribed")
+            return
+
+        if agent_id not in self._handlers:
+            self._handlers[agent_id] = []
+        self._handlers[agent_id].append(handler)
+
+        if self.redis:
+            # Start Redis subscription
+            task = asyncio.create_task(
+                self._subscription_loop(agent_id)
+            )
+            self._subscriptions[agent_id] = task
+
+        logger.info(f"Agent {agent_id} subscribed to message bus")
+
+    async def unsubscribe(self, agent_id: str) -> None:
+        """
+        Unsubscribe an agent from messages.
+
+        Args:
+            agent_id: The agent ID to unsubscribe
+        """
+        if agent_id in self._subscriptions:
+            self._subscriptions[agent_id].cancel()
+            try:
+                await self._subscriptions[agent_id]
+            except asyncio.CancelledError:
+                pass
+            del self._subscriptions[agent_id]
+
+        self._handlers.pop(agent_id, None)
+        logger.info(f"Agent {agent_id} unsubscribed from message bus")
+
+    async def _subscription_loop(self, agent_id: str) -> None:
+        """Main subscription loop for an agent"""
+        if not self.redis:
+            return
+
+        channel = self._channel(agent_id)
+        broadcast = self._broadcast_channel()
+
+        pubsub = self.redis.pubsub()
+        await pubsub.subscribe(channel, broadcast)
+
+        try:
+            while self._running:
+                message = await pubsub.get_message(
+                    ignore_subscribe_messages=True,
+                    timeout=1.0
+                )
+
+                if message and message["type"] == "message":
+                    await self._handle_incoming_message(
+                        agent_id,
+                        message["data"]
+                    )
+        except asyncio.CancelledError:
+            pass
+        finally:
+            await pubsub.unsubscribe(channel, broadcast)
+            await pubsub.close()
+
+    async def _handle_incoming_message(
+        self,
+        agent_id: str,
+        raw_data: bytes
+    ) -> None:
+        """Process an incoming message"""
+        try:
+            data = json.loads(raw_data)
+            message = AgentMessage.from_dict(data)
+
+            # Check if this is a response to a pending request
+            if message.correlation_id in self._pending_responses:
+                future = self._pending_responses[message.correlation_id]
+                if not future.done():
+                    future.set_result(message.payload)
+                return
+
+            # Call handlers
+            handlers = self._handlers.get(agent_id, [])
+            for handler in handlers:
+                try:
+                    response = await handler(message)
+
+                    # If handler returns data and there's a reply_to, send response
+                    if response and message.reply_to:
+                        await self.publish(AgentMessage(
+                            sender=agent_id,
+                            receiver=message.sender,
+                            message_type="response",
+                            payload=response,
+                            correlation_id=message.correlation_id
+                        ))
+
+                except Exception as e:
+                    logger.error(
+                        f"Error in message handler for {agent_id}: {e}"
+                    )
+
+        except Exception as e:
+            logger.error(f"Error processing message: {e}")
+
+    async def publish(self, message: AgentMessage) -> None:
+        """
+        Publishes a message to an agent.
+
+        Args:
+            message: The message to publish
+        """
+        # Persist message if enabled
+        if self.persist_messages:
+            await self._persist_message(message)
+
+        if self.redis:
+            channel = self._channel(message.receiver)
+            await self.redis.publish(
+                channel,
+                json.dumps(message.to_dict())
+            )
+        else:
+            # Local delivery for testing
+            await self._local_deliver(message)
+
+        logger.debug(
+            f"Published message from {message.sender} to {message.receiver}: "
+            f"{message.message_type}"
+        )
+
+    async def broadcast(self, message: AgentMessage) -> None:
+        """
+        Broadcasts a message to all agents.
+
+        Args:
+            message: The message to broadcast
+        """
+        message.receiver = "*"  # Indicate broadcast
+
+        if self.persist_messages:
+            await self._persist_message(message)
+
+        if self.redis:
+            await self.redis.publish(
+                self._broadcast_channel(),
+                json.dumps(message.to_dict())
+            )
+        else:
+            # Local broadcast
+            for agent_id in self._handlers:
+                await self._local_deliver(message, agent_id)
+
+        logger.debug(f"Broadcast message from {message.sender}: {message.message_type}")
+
+    async def request(
+        self,
+        message: AgentMessage,
+        timeout: float = 30.0
+    ) -> Dict[str, Any]:
+        """
+        Sends a request and waits for a response.
+
+        Args:
+            message: The request message
+            timeout: Timeout in seconds
+
+        Returns:
+            Response payload
+
+        Raises:
+            TimeoutError: If no response within timeout
+        """
+        # Mark this as a request that needs a response
+        message.reply_to = message.sender
+
+        # Create future for response
+        future: asyncio.Future = asyncio.Future()
+        self._pending_responses[message.correlation_id] = future
+
+        try:
+            # Publish the request
+            await self.publish(message)
+
+            # Wait for response
+            return await asyncio.wait_for(future, timeout)
+
+        except asyncio.TimeoutError:
+            logger.warning(
+                f"Request timeout: {message.sender} -> {message.receiver} "
+                f"({message.message_type})"
+            )
+            raise
+
+        finally:
+            # Clean up
+            self._pending_responses.pop(message.correlation_id, None)
+
+    async def _local_deliver(
+        self,
+        message: AgentMessage,
+        target_agent: Optional[str] = None
+    ) -> None:
+        """Local message delivery for testing without Redis"""
+        agent_id = target_agent or message.receiver
+        handlers = self._handlers.get(agent_id, [])
+
+        for handler in handlers:
+            try:
+                response = await handler(message)
+                if response and message.reply_to:
+                    if message.correlation_id in self._pending_responses:
+                        future = self._pending_responses[message.correlation_id]
+                        if not future.done():
+                            future.set_result(response)
+            except Exception as e:
+                logger.error(f"Error in local handler: {e}")
+
+    async def _persist_message(self, message: AgentMessage) -> None:
+        """Persist message to PostgreSQL for audit"""
+        if not self.db_pool:
+            return
+
+        try:
+            async with self.db_pool.acquire() as conn:
+                await conn.execute(
+                    """
+                    INSERT INTO agent_messages
+                        (id, sender, receiver, message_type, payload,
+                         priority, correlation_id, created_at)
+                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+                    """,
+                    str(uuid.uuid4()),
+                    message.sender,
+                    message.receiver,
+                    message.message_type,
+                    json.dumps(message.payload),
+                    message.priority.value,
+                    message.correlation_id,
+                    message.timestamp
+                )
+        except Exception as e:
+            logger.warning(f"Failed to persist message: {e}")
+
+    def on_message(self, message_type: str):
+        """
+        Decorator for message handlers.
+
+        Usage:
+            @bus.on_message("grade_request")
+            async def handle_grade(message):
+                return {"score": 85}
+        """
+        def decorator(func: MessageHandler):
+            async def wrapper(message: AgentMessage):
+                if message.message_type == message_type:
+                    return await func(message)
+                return None
+            return wrapper
+        return decorator
+
+    async def get_message_history(
+        self,
+        agent_id: Optional[str] = None,
+        message_type: Optional[str] = None,
+        limit: int = 100
+    ) -> List[Dict[str, Any]]:
+        """
+        Gets message history from persistence.
+
+        Args:
+            agent_id: Filter by sender or receiver
+            message_type: Filter by message type
+            limit: Maximum results
+
+        Returns:
+            List of message dicts
+        """
+        if not self.db_pool:
+            return []
+
+        query = """
+            SELECT sender, receiver, message_type, payload, priority,
+                   correlation_id, created_at
+            FROM agent_messages
+            WHERE 1=1
+        """
+        params = []
+
+        if agent_id:
+            query += " AND (sender = $1 OR receiver = $1)"
+            params.append(agent_id)
+
+        if message_type:
+            param_num = len(params) + 1
+            query += f" AND message_type = ${param_num}"
+            params.append(message_type)
+
+        query += f" ORDER BY created_at DESC LIMIT {limit}"
+
+        async with self.db_pool.acquire() as conn:
+            rows = await conn.fetch(query, *params)
+            return [dict(row) for row in rows]
+
+    @property
+    def connected(self) -> bool:
+        """Returns whether the bus is connected to Redis"""
+        return self.redis is not None and self._running
+
+    @property
+    def subscriber_count(self) -> int:
+        """Returns number of subscribed agents"""
+        return len(self._subscriptions)
@@ -0,0 +1,553 @@
+"""
+Agent Supervisor for Breakpilot
+
+Provides:
+- Agent lifecycle management
+- Health monitoring
+- Restart policies
+- Load balancing
+"""
+
+from typing import Dict, Optional, Callable, Awaitable, List, Any
+from dataclasses import dataclass, field
+from datetime import datetime, timezone, timedelta
+from enum import Enum
+import asyncio
+import logging
+
+from agent_core.sessions.heartbeat import HeartbeatMonitor
+from agent_core.orchestrator.message_bus import (
+    MessageBus,
+    AgentMessage,
+    MessagePriority,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class AgentStatus(Enum):
+    """Agent lifecycle states"""
+    INITIALIZING = "initializing"
+    STARTING = "starting"
+    RUNNING = "running"
+    PAUSED = "paused"
+    STOPPING = "stopping"
+    STOPPED = "stopped"
+    ERROR = "error"
+    RESTARTING = "restarting"
+
+
+class RestartPolicy(Enum):
+    """Agent restart policies"""
+    NEVER = "never"
+    ON_FAILURE = "on_failure"
+    ALWAYS = "always"
+
+
+@dataclass
+class AgentInfo:
+    """Information about a registered agent"""
+    agent_id: str
+    agent_type: str
+    status: AgentStatus = AgentStatus.INITIALIZING
+    current_task: Optional[str] = None
+    started_at: Optional[datetime] = None
+    last_activity: Optional[datetime] = None
+    error_count: int = 0
+    restart_count: int = 0
+    max_restarts: int = 3
+    restart_policy: RestartPolicy = RestartPolicy.ON_FAILURE
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    capacity: int = 10  # Max concurrent tasks
+    current_load: int = 0
+
+    def is_healthy(self) -> bool:
+        """Check if agent is healthy"""
+        return self.status == AgentStatus.RUNNING and self.error_count < 3
+
+    def is_available(self) -> bool:
+        """Check if agent can accept new tasks"""
+        return (
+            self.status == AgentStatus.RUNNING and
+            self.current_load < self.capacity
+        )
+
+    def utilization(self) -> float:
+        """Returns agent utilization (0-1)"""
+        return self.current_load / max(self.capacity, 1)
+
+
+AgentFactory = Callable[[str], Awaitable[Any]]
+
+
+class AgentSupervisor:
+    """
+    Supervises and coordinates all agents.
+
+    Responsibilities:
+    - Agent registration and lifecycle
+    - Health monitoring via heartbeat
+    - Restart policies
+    - Load balancing
+    - Alert escalation
+    """
+
+    def __init__(
+        self,
+        message_bus: MessageBus,
+        heartbeat_monitor: Optional[HeartbeatMonitor] = None,
+        check_interval_seconds: int = 10
+    ):
+        """
+        Initialize the supervisor.
+
+        Args:
+            message_bus: Message bus for inter-agent communication
+            heartbeat_monitor: Heartbeat monitor for liveness checks
+            check_interval_seconds: How often to run health checks
+        """
+        self.bus = message_bus
+        self.heartbeat = heartbeat_monitor or HeartbeatMonitor()
+        self.check_interval = check_interval_seconds
+
+        self.agents: Dict[str, AgentInfo] = {}
+        self._factories: Dict[str, AgentFactory] = {}
+        self._running = False
+        self._health_task: Optional[asyncio.Task] = None
+
+        # Set up heartbeat timeout handler
+        self.heartbeat.on_timeout = self._handle_agent_timeout
+
+    async def start(self) -> None:
+        """Starts the supervisor"""
+        self._running = True
+        await self.heartbeat.start_monitoring()
+
+        # Start health check loop
+        self._health_task = asyncio.create_task(self._health_check_loop())
+
+        logger.info("Agent supervisor started")
+
+    async def stop(self) -> None:
+        """Stops the supervisor and all agents"""
+        self._running = False
+
+        # Stop health check
+        if self._health_task:
+            self._health_task.cancel()
+            try:
+                await self._health_task
+            except asyncio.CancelledError:
+                pass
+
+        # Stop heartbeat monitor
+        await self.heartbeat.stop_monitoring()
+
+        # Stop all agents
+        for agent_id in list(self.agents.keys()):
+            await self.stop_agent(agent_id)
+
+        logger.info("Agent supervisor stopped")
+
+    def register_factory(
+        self,
+        agent_type: str,
+        factory: AgentFactory
+    ) -> None:
+        """
+        Registers a factory function for creating agents.
+
+        Args:
+            agent_type: Type of agent this factory creates
+            factory: Async function that creates agent instances
+        """
+        self._factories[agent_type] = factory
+        logger.debug(f"Registered factory for agent type: {agent_type}")
+
+    async def register_agent(
+        self,
+        agent_id: str,
+        agent_type: str,
+        restart_policy: RestartPolicy = RestartPolicy.ON_FAILURE,
+        max_restarts: int = 3,
+        capacity: int = 10,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> AgentInfo:
+        """
+        Registers a new agent with the supervisor.
+
+        Args:
+            agent_id: Unique agent identifier
+            agent_type: Type of agent
+            restart_policy: When to restart the agent
+            max_restarts: Maximum restart attempts
+            capacity: Max concurrent tasks
+            metadata: Additional agent metadata
+
+        Returns:
+            AgentInfo for the registered agent
+        """
+        if agent_id in self.agents:
+            logger.warning(f"Agent {agent_id} already registered")
+            return self.agents[agent_id]
+
+        info = AgentInfo(
+            agent_id=agent_id,
+            agent_type=agent_type,
+            restart_policy=restart_policy,
+            max_restarts=max_restarts,
+            capacity=capacity,
+            metadata=metadata or {}
+        )
+
+        self.agents[agent_id] = info
+        self.heartbeat.register(agent_id, agent_type)
+
+        logger.info(f"Registered agent: {agent_id} ({agent_type})")
+
+        return info
+
+    async def start_agent(self, agent_id: str) -> bool:
+        """
+        Starts a registered agent.
+
+        Args:
+            agent_id: The agent to start
+
+        Returns:
+            True if started successfully
+        """
+        if agent_id not in self.agents:
+            logger.error(f"Cannot start unregistered agent: {agent_id}")
+            return False
+
+        info = self.agents[agent_id]
+
+        if info.status == AgentStatus.RUNNING:
+            logger.warning(f"Agent {agent_id} is already running")
+            return True
+
+        info.status = AgentStatus.STARTING
+
+        try:
+            # If we have a factory, create the agent
+            if info.agent_type in self._factories:
+                factory = self._factories[info.agent_type]
+                await factory(agent_id)
+
+            info.status = AgentStatus.RUNNING
+            info.started_at = datetime.now(timezone.utc)
+            info.last_activity = info.started_at
+
+            # Subscribe to message bus
+            await self.bus.subscribe(
+                agent_id,
+                self._create_message_handler(agent_id)
+            )
+
+            logger.info(f"Started agent: {agent_id}")
+            return True
+
+        except Exception as e:
+            info.status = AgentStatus.ERROR
+            info.error_count += 1
+            logger.error(f"Failed to start agent {agent_id}: {e}")
+            return False
+
+    async def stop_agent(self, agent_id: str) -> bool:
+        """
+        Stops a running agent.
+
+        Args:
+            agent_id: The agent to stop
+
+        Returns:
+            True if stopped successfully
+        """
+        if agent_id not in self.agents:
+            return False
+
+        info = self.agents[agent_id]
+        info.status = AgentStatus.STOPPING
+
+        try:
+            # Unsubscribe from message bus
+            await self.bus.unsubscribe(agent_id)
+
+            # Unregister from heartbeat
+            self.heartbeat.unregister(agent_id)
+
+            info.status = AgentStatus.STOPPED
+            logger.info(f"Stopped agent: {agent_id}")
+            return True
+
+        except Exception as e:
+            info.status = AgentStatus.ERROR
+            logger.error(f"Error stopping agent {agent_id}: {e}")
+            return False
+
+    async def restart_agent(self, agent_id: str) -> bool:
+        """
+        Restarts an agent.
+
+        Args:
+            agent_id: The agent to restart
+
+        Returns:
+            True if restarted successfully
+        """
+        if agent_id not in self.agents:
+            return False
+
+        info = self.agents[agent_id]
+
+        # Check restart count
+        if info.restart_count >= info.max_restarts:
+            logger.error(
+                f"Agent {agent_id} exceeded max restarts "
+                f"({info.restart_count}/{info.max_restarts})"
+            )
+            await self._escalate_agent_failure(agent_id)
+            return False
+
+        info.status = AgentStatus.RESTARTING
+        info.restart_count += 1
+
+        logger.info(
+            f"Restarting agent {agent_id} "
+            f"(attempt {info.restart_count}/{info.max_restarts})"
+        )
+
+        # Stop and start
+        await self.stop_agent(agent_id)
+        await asyncio.sleep(1)  # Brief pause
+        return await self.start_agent(agent_id)
+
+    async def _handle_agent_timeout(
+        self,
+        session_id: str,
+        agent_type: str
+    ) -> None:
+        """Handles agent heartbeat timeout"""
+        # Find the agent by session/ID
+        agent_id = session_id  # In our case, session_id == agent_id
+
+        if agent_id not in self.agents:
+            return
+
+        info = self.agents[agent_id]
+        info.status = AgentStatus.ERROR
+        info.error_count += 1
+
+        logger.warning(f"Agent {agent_id} timed out (heartbeat)")
+
+        # Apply restart policy
+        if info.restart_policy == RestartPolicy.NEVER:
+            await self._escalate_agent_failure(agent_id)
+        elif info.restart_policy == RestartPolicy.ON_FAILURE:
+            if info.restart_count < info.max_restarts:
+                await self.restart_agent(agent_id)
+            else:
+                await self._escalate_agent_failure(agent_id)
+        elif info.restart_policy == RestartPolicy.ALWAYS:
+            await self.restart_agent(agent_id)
+
+    async def _escalate_agent_failure(self, agent_id: str) -> None:
+        """Escalates an agent failure to the alert system"""
+        info = self.agents.get(agent_id)
+        if not info:
+            return
+
+        await self.bus.publish(AgentMessage(
+            sender="supervisor",
+            receiver="alert-agent",
+            message_type="agent_failure",
+            payload={
+                "agent_id": agent_id,
+                "agent_type": info.agent_type,
+                "error_count": info.error_count,
+                "restart_count": info.restart_count,
+                "last_activity": info.last_activity.isoformat() if info.last_activity else None
+            },
+            priority=MessagePriority.CRITICAL
+        ))
+
+        logger.error(f"Escalated agent failure: {agent_id}")
+
+    def _create_message_handler(self, agent_id: str):
+        """Creates a message handler that updates agent activity"""
+        async def handler(message: AgentMessage):
+            if agent_id in self.agents:
+                self.agents[agent_id].last_activity = datetime.now(timezone.utc)
+                # Heartbeat on activity
+                self.heartbeat.beat(agent_id)
+            return None
+        return handler
+
+    async def _health_check_loop(self) -> None:
+        """Periodic health check loop"""
+        while self._running:
+            try:
+                await asyncio.sleep(self.check_interval)
+                await self._run_health_checks()
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Error in health check loop: {e}")
+
+    async def _run_health_checks(self) -> None:
+        """Runs health checks on all agents"""
+        now = datetime.now(timezone.utc)
+
+        for agent_id, info in list(self.agents.items()):
+            if info.status != AgentStatus.RUNNING:
+                continue
+
+            # Check for stale agents (no activity for 5 minutes)
+            if info.last_activity:
+                idle_time = now - info.last_activity
+                if idle_time > timedelta(minutes=5):
+                    logger.warning(
+                        f"Agent {agent_id} has been idle for "
+                        f"{idle_time.total_seconds():.0f}s"
+                    )
+
+    # Load Balancing
+
+    def get_available_agent(
+        self,
+        agent_type: str,
+        strategy: str = "least_loaded"
+    ) -> Optional[str]:
+        """
+        Gets an available agent of the specified type.
+
+        Args:
+            agent_type: Type of agent needed
+            strategy: Load balancing strategy
+
+        Returns:
+            Agent ID or None if none available
+        """
+        candidates = [
+            info for info in self.agents.values()
+            if info.agent_type == agent_type and info.is_available()
+        ]
+
+        if not candidates:
+            return None
+
+        if strategy == "least_loaded":
+            # Pick agent with lowest load
+            best = min(candidates, key=lambda a: a.utilization())
+        elif strategy == "round_robin":
+            # Simple round-robin (just pick first available)
+            best = candidates[0]
+        else:
+            best = candidates[0]
+
+        return best.agent_id
+
+    def acquire_capacity(self, agent_id: str) -> bool:
+        """
+        Acquires capacity from an agent.
+
+        Args:
+            agent_id: The agent to acquire from
+
+        Returns:
+            True if capacity was acquired
+        """
+        if agent_id not in self.agents:
+            return False
+
+        info = self.agents[agent_id]
+        if not info.is_available():
+            return False
+
+        info.current_load += 1
+        return True
+
+    def release_capacity(self, agent_id: str) -> None:
+        """
+        Releases capacity back to an agent.
+
+        Args:
+            agent_id: The agent to release to
+        """
+        if agent_id in self.agents:
+            info = self.agents[agent_id]
+            info.current_load = max(0, info.current_load - 1)
+
+    # Status and Metrics
+
+    def get_agent_status(self, agent_id: str) -> Optional[Dict[str, Any]]:
+        """Gets status information for an agent"""
+        if agent_id not in self.agents:
+            return None
+
+        info = self.agents[agent_id]
+        return {
+            "agent_id": info.agent_id,
+            "agent_type": info.agent_type,
+            "status": info.status.value,
+            "current_task": info.current_task,
+            "started_at": info.started_at.isoformat() if info.started_at else None,
+            "last_activity": info.last_activity.isoformat() if info.last_activity else None,
+            "error_count": info.error_count,
+            "restart_count": info.restart_count,
+            "utilization": info.utilization(),
+            "is_healthy": info.is_healthy(),
+            "is_available": info.is_available()
+        }
+
+    def get_all_status(self) -> Dict[str, Dict[str, Any]]:
+        """Gets status for all agents"""
+        return {
+            agent_id: self.get_agent_status(agent_id)
+            for agent_id in self.agents
+        }
+
+    def get_metrics(self) -> Dict[str, Any]:
+        """Gets aggregate metrics"""
+        total = len(self.agents)
+        running = sum(
+            1 for a in self.agents.values()
+            if a.status == AgentStatus.RUNNING
+        )
+        healthy = sum(1 for a in self.agents.values() if a.is_healthy())
+        available = sum(1 for a in self.agents.values() if a.is_available())
+
+        total_capacity = sum(a.capacity for a in self.agents.values())
+        total_load = sum(a.current_load for a in self.agents.values())
+
+        return {
+            "total_agents": total,
+            "running_agents": running,
+            "healthy_agents": healthy,
+            "available_agents": available,
+            "total_capacity": total_capacity,
+            "total_load": total_load,
+            "overall_utilization": total_load / max(total_capacity, 1),
+            "by_type": self._metrics_by_type()
+        }
+
+    def _metrics_by_type(self) -> Dict[str, Dict[str, int]]:
+        """Gets metrics grouped by agent type"""
+        by_type: Dict[str, Dict[str, int]] = {}
+
+        for info in self.agents.values():
+            if info.agent_type not in by_type:
+                by_type[info.agent_type] = {
+                    "total": 0,
+                    "running": 0,
+                    "healthy": 0
+                }
+
+            by_type[info.agent_type]["total"] += 1
+            if info.status == AgentStatus.RUNNING:
+                by_type[info.agent_type]["running"] += 1
+            if info.is_healthy():
+                by_type[info.agent_type]["healthy"] += 1
+
+        return by_type
@@ -0,0 +1,436 @@
+"""
+Task Router for Intent-Based Routing
+
+Provides:
+- Intent classification
+- Agent selection
+- Fallback handling
+- Routing metrics
+"""
+
+from typing import Dict, Optional, List, Any, Callable, Awaitable
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+import logging
+import re
+
+logger = logging.getLogger(__name__)
+
+
+class RoutingStrategy(Enum):
+    """Routing strategies"""
+    DIRECT = "direct"           # Route to specific agent
+    ROUND_ROBIN = "round_robin"  # Distribute evenly
+    LEAST_LOADED = "least_loaded"  # Route to least loaded
+    PRIORITY = "priority"        # Route based on priority
+
+
+@dataclass
+class RoutingRule:
+    """A rule for routing tasks to agents"""
+    intent_pattern: str
+    agent_type: str
+    priority: int = 0
+    conditions: Dict[str, Any] = field(default_factory=dict)
+    fallback_agent: Optional[str] = None
+
+    def matches(self, intent: str, context: Dict[str, Any]) -> bool:
+        """Check if this rule matches the intent and context"""
+        # Check intent pattern (supports wildcards)
+        pattern = self.intent_pattern.replace("*", ".*")
+        if not re.match(f"^{pattern}$", intent, re.IGNORECASE):
+            return False
+
+        # Check conditions
+        for key, value in self.conditions.items():
+            if context.get(key) != value:
+                return False
+
+        return True
+
+
+@dataclass
+class RoutingResult:
+    """Result of a routing decision"""
+    success: bool
+    agent_id: Optional[str] = None
+    agent_type: Optional[str] = None
+    is_fallback: bool = False
+    reason: str = ""
+    routing_time_ms: float = 0
+
+
+class TaskRouter:
+    """
+    Routes tasks to appropriate agents based on intent.
+
+    Features:
+    - Pattern-based routing rules
+    - Priority ordering
+    - Fallback chains
+    - Routing metrics
+    """
+
+    def __init__(self, supervisor=None):
+        """
+        Initialize the task router.
+
+        Args:
+            supervisor: AgentSupervisor for agent availability
+        """
+        self.supervisor = supervisor
+        self.rules: List[RoutingRule] = []
+        self._default_routes: Dict[str, str] = {}
+        self._routing_history: List[Dict[str, Any]] = []
+        self._max_history = 1000
+
+        # Initialize default rules
+        self._setup_default_rules()
+
+    def _setup_default_rules(self) -> None:
+        """Sets up default routing rules"""
+        default_rules = [
+            # Learning support
+            RoutingRule(
+                intent_pattern="learning_*",
+                agent_type="tutor-agent",
+                priority=10,
+                fallback_agent="orchestrator"
+            ),
+            RoutingRule(
+                intent_pattern="help_*",
+                agent_type="tutor-agent",
+                priority=5
+            ),
+            RoutingRule(
+                intent_pattern="explain_*",
+                agent_type="tutor-agent",
+                priority=5
+            ),
+
+            # Grading
+            RoutingRule(
+                intent_pattern="grade_*",
+                agent_type="grader-agent",
+                priority=10,
+                fallback_agent="quality-judge"
+            ),
+            RoutingRule(
+                intent_pattern="evaluate_*",
+                agent_type="grader-agent",
+                priority=5
+            ),
+            RoutingRule(
+                intent_pattern="correct_*",
+                agent_type="grader-agent",
+                priority=5
+            ),
+
+            # Quality checks
+            RoutingRule(
+                intent_pattern="quality_*",
+                agent_type="quality-judge",
+                priority=10
+            ),
+            RoutingRule(
+                intent_pattern="review_*",
+                agent_type="quality-judge",
+                priority=5
+            ),
+
+            # Alerts
+            RoutingRule(
+                intent_pattern="alert_*",
+                agent_type="alert-agent",
+                priority=10
+            ),
+            RoutingRule(
+                intent_pattern="notify_*",
+                agent_type="alert-agent",
+                priority=5
+            ),
+
+            # System/Admin
+            RoutingRule(
+                intent_pattern="system_*",
+                agent_type="orchestrator",
+                priority=10
+            ),
+            RoutingRule(
+                intent_pattern="admin_*",
+                agent_type="orchestrator",
+                priority=10
+            ),
+        ]
+
+        for rule in default_rules:
+            self.add_rule(rule)
+
+    def add_rule(self, rule: RoutingRule) -> None:
+        """
+        Adds a routing rule.
+
+        Args:
+            rule: The routing rule to add
+        """
+        self.rules.append(rule)
+        # Sort by priority (higher first)
+        self.rules.sort(key=lambda r: r.priority, reverse=True)
+
+    def remove_rule(self, intent_pattern: str) -> bool:
+        """
+        Removes a routing rule by pattern.
+
+        Args:
+            intent_pattern: The pattern to remove
+
+        Returns:
+            True if a rule was removed
+        """
+        original_len = len(self.rules)
+        self.rules = [r for r in self.rules if r.intent_pattern != intent_pattern]
+        return len(self.rules) < original_len
+
+    def set_default_route(self, agent_type: str, agent_id: str) -> None:
+        """
+        Sets a default agent for a type.
+
+        Args:
+            agent_type: The agent type
+            agent_id: The default agent ID
+        """
+        self._default_routes[agent_type] = agent_id
+
+    async def route(
+        self,
+        intent: str,
+        context: Optional[Dict[str, Any]] = None,
+        strategy: RoutingStrategy = RoutingStrategy.LEAST_LOADED
+    ) -> RoutingResult:
+        """
+        Routes a task based on intent.
+
+        Args:
+            intent: The task intent
+            context: Additional context for routing
+            strategy: Load balancing strategy
+
+        Returns:
+            RoutingResult with agent assignment
+        """
+        start_time = datetime.now(timezone.utc)
+        context = context or {}
+
+        # Find matching rule
+        matching_rule = None
+        for rule in self.rules:
+            if rule.matches(intent, context):
+                matching_rule = rule
+                break
+
+        if not matching_rule:
+            result = RoutingResult(
+                success=False,
+                reason=f"No routing rule matches intent: {intent}"
+            )
+            self._record_routing(intent, result)
+            return result
+
+        # Get available agent
+        agent_id = await self._get_agent(
+            matching_rule.agent_type,
+            strategy
+        )
+
+        if agent_id:
+            result = RoutingResult(
+                success=True,
+                agent_id=agent_id,
+                agent_type=matching_rule.agent_type,
+                is_fallback=False,
+                reason="Primary agent selected"
+            )
+        elif matching_rule.fallback_agent:
+            # Try fallback
+            agent_id = await self._get_agent(
+                matching_rule.fallback_agent,
+                strategy
+            )
+            if agent_id:
+                result = RoutingResult(
+                    success=True,
+                    agent_id=agent_id,
+                    agent_type=matching_rule.fallback_agent,
+                    is_fallback=True,
+                    reason="Fallback agent selected"
+                )
+            else:
+                result = RoutingResult(
+                    success=False,
+                    reason="No agents available (primary or fallback)"
+                )
+        else:
+            result = RoutingResult(
+                success=False,
+                reason=f"No {matching_rule.agent_type} agents available"
+            )
+
+        # Calculate routing time
+        end_time = datetime.now(timezone.utc)
+        result.routing_time_ms = (end_time - start_time).total_seconds() * 1000
+
+        self._record_routing(intent, result)
+
+        return result
+
+    async def _get_agent(
+        self,
+        agent_type: str,
+        strategy: RoutingStrategy
+    ) -> Optional[str]:
+        """Gets an available agent of the specified type"""
+        # Check default route first
+        if agent_type in self._default_routes:
+            agent_id = self._default_routes[agent_type]
+            if self.supervisor and self.supervisor.agents.get(agent_id):
+                info = self.supervisor.agents[agent_id]
+                if info.is_available():
+                    return agent_id
+
+        # Use supervisor for load balancing
+        if self.supervisor:
+            strategy_str = "least_loaded"
+            if strategy == RoutingStrategy.ROUND_ROBIN:
+                strategy_str = "round_robin"
+
+            return self.supervisor.get_available_agent(
+                agent_type,
+                strategy_str
+            )
+
+        return None
+
+    def _record_routing(
+        self,
+        intent: str,
+        result: RoutingResult
+    ) -> None:
+        """Records routing decision for analytics"""
+        record = {
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "intent": intent,
+            "success": result.success,
+            "agent_id": result.agent_id,
+            "agent_type": result.agent_type,
+            "is_fallback": result.is_fallback,
+            "routing_time_ms": result.routing_time_ms,
+            "reason": result.reason
+        }
+
+        self._routing_history.append(record)
+
+        # Trim history
+        if len(self._routing_history) > self._max_history:
+            self._routing_history = self._routing_history[-self._max_history:]
+
+        # Log
+        if result.success:
+            logger.debug(
+                f"Routed '{intent}' to {result.agent_id} "
+                f"({'fallback' if result.is_fallback else 'primary'})"
+            )
+        else:
+            logger.warning(f"Failed to route '{intent}': {result.reason}")
+
+    # Analytics
+
+    def get_routing_stats(self) -> Dict[str, Any]:
+        """Gets routing statistics"""
+        if not self._routing_history:
+            return {
+                "total_routes": 0,
+                "success_rate": 0,
+                "fallback_rate": 0,
+                "avg_routing_time_ms": 0
+            }
+
+        total = len(self._routing_history)
+        successful = sum(1 for r in self._routing_history if r["success"])
+        fallbacks = sum(1 for r in self._routing_history if r["is_fallback"])
+        avg_time = sum(
+            r["routing_time_ms"] for r in self._routing_history
+        ) / total
+
+        return {
+            "total_routes": total,
+            "successful_routes": successful,
+            "success_rate": successful / total,
+            "fallback_routes": fallbacks,
+            "fallback_rate": fallbacks / max(successful, 1),
+            "avg_routing_time_ms": avg_time
+        }
+
+    def get_intent_distribution(self) -> Dict[str, int]:
+        """Gets distribution of routed intents"""
+        distribution: Dict[str, int] = {}
+        for record in self._routing_history:
+            intent = record["intent"]
+            # Extract base intent (before _)
+            base = intent.split("_")[0] if "_" in intent else intent
+            distribution[base] = distribution.get(base, 0) + 1
+        return distribution
+
+    def get_agent_distribution(self) -> Dict[str, int]:
+        """Gets distribution of routes by agent type"""
+        distribution: Dict[str, int] = {}
+        for record in self._routing_history:
+            agent_type = record.get("agent_type", "unknown")
+            if agent_type:
+                distribution[agent_type] = distribution.get(agent_type, 0) + 1
+        return distribution
+
+    def get_failure_reasons(self) -> Dict[str, int]:
+        """Gets distribution of routing failure reasons"""
+        reasons: Dict[str, int] = {}
+        for record in self._routing_history:
+            if not record["success"]:
+                reason = record["reason"]
+                reasons[reason] = reasons.get(reason, 0) + 1
+        return reasons
+
+    def clear_history(self) -> None:
+        """Clears routing history"""
+        self._routing_history.clear()
+
+    # Rule inspection
+
+    def get_rules(self) -> List[Dict[str, Any]]:
+        """Gets all routing rules as dicts"""
+        return [
+            {
+                "intent_pattern": r.intent_pattern,
+                "agent_type": r.agent_type,
+                "priority": r.priority,
+                "conditions": r.conditions,
+                "fallback_agent": r.fallback_agent
+            }
+            for r in self.rules
+        ]
+
+    def find_matching_rules(
+        self,
+        intent: str,
+        context: Optional[Dict[str, Any]] = None
+    ) -> List[Dict[str, Any]]:
+        """Finds all rules that match an intent"""
+        context = context or {}
+        return [
+            {
+                "intent_pattern": r.intent_pattern,
+                "agent_type": r.agent_type,
+                "priority": r.priority
+            }
+            for r in self.rules
+            if r.matches(intent, context)
+        ]