fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
361
agent-core/sessions/heartbeat.py
Normal file
361
agent-core/sessions/heartbeat.py
Normal file
@@ -0,0 +1,361 @@
|
||||
"""
|
||||
Heartbeat Monitoring for Breakpilot Agents
|
||||
|
||||
Provides liveness monitoring for agents with:
|
||||
- Configurable timeout thresholds
|
||||
- Async background monitoring
|
||||
- Callback-based timeout handling
|
||||
- Integration with SessionManager
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Dict, Callable, Optional, Awaitable, Set
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from dataclasses import dataclass, field
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class HeartbeatEntry:
|
||||
"""Represents a heartbeat entry for an agent"""
|
||||
session_id: str
|
||||
agent_type: str
|
||||
last_beat: datetime
|
||||
missed_beats: int = 0
|
||||
|
||||
|
||||
class HeartbeatMonitor:
|
||||
"""
|
||||
Monitors agent heartbeats and triggers callbacks on timeout.
|
||||
|
||||
Usage:
|
||||
monitor = HeartbeatMonitor(timeout_seconds=30)
|
||||
monitor.on_timeout = handle_timeout
|
||||
await monitor.start_monitoring()
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout_seconds: int = 30,
|
||||
check_interval_seconds: int = 5,
|
||||
max_missed_beats: int = 3
|
||||
):
|
||||
"""
|
||||
Initialize the heartbeat monitor.
|
||||
|
||||
Args:
|
||||
timeout_seconds: Time without heartbeat before considered stale
|
||||
check_interval_seconds: How often to check for stale sessions
|
||||
max_missed_beats: Number of missed beats before triggering timeout
|
||||
"""
|
||||
self.sessions: Dict[str, HeartbeatEntry] = {}
|
||||
self.timeout = timedelta(seconds=timeout_seconds)
|
||||
self.check_interval = check_interval_seconds
|
||||
self.max_missed_beats = max_missed_beats
|
||||
self.on_timeout: Optional[Callable[[str, str], Awaitable[None]]] = None
|
||||
self.on_warning: Optional[Callable[[str, int], Awaitable[None]]] = None
|
||||
self._running = False
|
||||
self._task: Optional[asyncio.Task] = None
|
||||
self._paused_sessions: Set[str] = set()
|
||||
|
||||
async def start_monitoring(self) -> None:
|
||||
"""
|
||||
Starts the background heartbeat monitoring task.
|
||||
|
||||
This runs indefinitely until stop_monitoring() is called.
|
||||
"""
|
||||
if self._running:
|
||||
logger.warning("Heartbeat monitor already running")
|
||||
return
|
||||
|
||||
self._running = True
|
||||
self._task = asyncio.create_task(self._monitoring_loop())
|
||||
logger.info(
|
||||
f"Heartbeat monitor started (timeout={self.timeout.seconds}s, "
|
||||
f"interval={self.check_interval}s)"
|
||||
)
|
||||
|
||||
async def stop_monitoring(self) -> None:
|
||||
"""Stops the heartbeat monitoring task"""
|
||||
self._running = False
|
||||
if self._task:
|
||||
self._task.cancel()
|
||||
try:
|
||||
await self._task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self._task = None
|
||||
logger.info("Heartbeat monitor stopped")
|
||||
|
||||
async def _monitoring_loop(self) -> None:
|
||||
"""Main monitoring loop"""
|
||||
while self._running:
|
||||
try:
|
||||
await asyncio.sleep(self.check_interval)
|
||||
await self._check_heartbeats()
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in heartbeat monitoring: {e}")
|
||||
|
||||
async def _check_heartbeats(self) -> None:
|
||||
"""Checks all registered sessions for stale heartbeats"""
|
||||
now = datetime.now(timezone.utc)
|
||||
timed_out = []
|
||||
|
||||
for session_id, entry in list(self.sessions.items()):
|
||||
# Skip paused sessions
|
||||
if session_id in self._paused_sessions:
|
||||
continue
|
||||
|
||||
time_since_beat = now - entry.last_beat
|
||||
|
||||
if time_since_beat > self.timeout:
|
||||
entry.missed_beats += 1
|
||||
|
||||
# Warn on first missed beat
|
||||
if entry.missed_beats == 1 and self.on_warning:
|
||||
await self.on_warning(session_id, entry.missed_beats)
|
||||
logger.warning(
|
||||
f"Session {session_id} missed heartbeat "
|
||||
f"({entry.missed_beats}/{self.max_missed_beats})"
|
||||
)
|
||||
|
||||
# Timeout after max missed beats
|
||||
if entry.missed_beats >= self.max_missed_beats:
|
||||
timed_out.append((session_id, entry.agent_type))
|
||||
|
||||
# Handle timeouts
|
||||
for session_id, agent_type in timed_out:
|
||||
logger.error(
|
||||
f"Session {session_id} ({agent_type}) timed out after "
|
||||
f"{self.max_missed_beats} missed heartbeats"
|
||||
)
|
||||
|
||||
if self.on_timeout:
|
||||
try:
|
||||
await self.on_timeout(session_id, agent_type)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in timeout handler: {e}")
|
||||
|
||||
# Remove from tracking
|
||||
del self.sessions[session_id]
|
||||
self._paused_sessions.discard(session_id)
|
||||
|
||||
def register(self, session_id: str, agent_type: str) -> None:
|
||||
"""
|
||||
Registers a session for heartbeat monitoring.
|
||||
|
||||
Args:
|
||||
session_id: The session ID to monitor
|
||||
agent_type: The type of agent
|
||||
"""
|
||||
self.sessions[session_id] = HeartbeatEntry(
|
||||
session_id=session_id,
|
||||
agent_type=agent_type,
|
||||
last_beat=datetime.now(timezone.utc)
|
||||
)
|
||||
logger.debug(f"Registered session {session_id} for heartbeat monitoring")
|
||||
|
||||
def beat(self, session_id: str) -> bool:
|
||||
"""
|
||||
Records a heartbeat for a session.
|
||||
|
||||
Args:
|
||||
session_id: The session ID
|
||||
|
||||
Returns:
|
||||
True if the session is registered, False otherwise
|
||||
"""
|
||||
if session_id in self.sessions:
|
||||
self.sessions[session_id].last_beat = datetime.now(timezone.utc)
|
||||
self.sessions[session_id].missed_beats = 0
|
||||
return True
|
||||
return False
|
||||
|
||||
def unregister(self, session_id: str) -> bool:
|
||||
"""
|
||||
Unregisters a session from heartbeat monitoring.
|
||||
|
||||
Args:
|
||||
session_id: The session ID to unregister
|
||||
|
||||
Returns:
|
||||
True if the session was registered, False otherwise
|
||||
"""
|
||||
self._paused_sessions.discard(session_id)
|
||||
if session_id in self.sessions:
|
||||
del self.sessions[session_id]
|
||||
logger.debug(f"Unregistered session {session_id} from heartbeat monitoring")
|
||||
return True
|
||||
return False
|
||||
|
||||
def pause(self, session_id: str) -> bool:
|
||||
"""
|
||||
Pauses heartbeat monitoring for a session.
|
||||
|
||||
Useful when a session is intentionally idle (e.g., waiting for user input).
|
||||
|
||||
Args:
|
||||
session_id: The session ID to pause
|
||||
|
||||
Returns:
|
||||
True if the session was registered, False otherwise
|
||||
"""
|
||||
if session_id in self.sessions:
|
||||
self._paused_sessions.add(session_id)
|
||||
logger.debug(f"Paused heartbeat monitoring for session {session_id}")
|
||||
return True
|
||||
return False
|
||||
|
||||
def resume(self, session_id: str) -> bool:
|
||||
"""
|
||||
Resumes heartbeat monitoring for a paused session.
|
||||
|
||||
Args:
|
||||
session_id: The session ID to resume
|
||||
|
||||
Returns:
|
||||
True if the session was paused, False otherwise
|
||||
"""
|
||||
if session_id in self._paused_sessions:
|
||||
self._paused_sessions.discard(session_id)
|
||||
# Reset the heartbeat timer
|
||||
self.beat(session_id)
|
||||
logger.debug(f"Resumed heartbeat monitoring for session {session_id}")
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_status(self, session_id: str) -> Optional[Dict]:
|
||||
"""
|
||||
Gets the heartbeat status for a session.
|
||||
|
||||
Args:
|
||||
session_id: The session ID
|
||||
|
||||
Returns:
|
||||
Status dict or None if not registered
|
||||
"""
|
||||
if session_id not in self.sessions:
|
||||
return None
|
||||
|
||||
entry = self.sessions[session_id]
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
return {
|
||||
"session_id": session_id,
|
||||
"agent_type": entry.agent_type,
|
||||
"last_beat": entry.last_beat.isoformat(),
|
||||
"seconds_since_beat": (now - entry.last_beat).total_seconds(),
|
||||
"missed_beats": entry.missed_beats,
|
||||
"is_paused": session_id in self._paused_sessions,
|
||||
"is_healthy": entry.missed_beats == 0
|
||||
}
|
||||
|
||||
def get_all_status(self) -> Dict[str, Dict]:
|
||||
"""
|
||||
Gets heartbeat status for all registered sessions.
|
||||
|
||||
Returns:
|
||||
Dict mapping session_id to status dict
|
||||
"""
|
||||
return {
|
||||
session_id: self.get_status(session_id)
|
||||
for session_id in self.sessions
|
||||
}
|
||||
|
||||
@property
|
||||
def registered_count(self) -> int:
|
||||
"""Returns the number of registered sessions"""
|
||||
return len(self.sessions)
|
||||
|
||||
@property
|
||||
def healthy_count(self) -> int:
|
||||
"""Returns the number of healthy sessions (no missed beats)"""
|
||||
return sum(
|
||||
1 for entry in self.sessions.values()
|
||||
if entry.missed_beats == 0
|
||||
)
|
||||
|
||||
|
||||
class HeartbeatClient:
|
||||
"""
|
||||
Client-side heartbeat sender for agents.
|
||||
|
||||
Usage:
|
||||
client = HeartbeatClient(session_id, heartbeat_url)
|
||||
await client.start()
|
||||
# ... agent work ...
|
||||
await client.stop()
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
session_id: str,
|
||||
monitor: Optional[HeartbeatMonitor] = None,
|
||||
interval_seconds: int = 10
|
||||
):
|
||||
"""
|
||||
Initialize the heartbeat client.
|
||||
|
||||
Args:
|
||||
session_id: The session ID to send heartbeats for
|
||||
monitor: Optional local HeartbeatMonitor (for in-process agents)
|
||||
interval_seconds: How often to send heartbeats
|
||||
"""
|
||||
self.session_id = session_id
|
||||
self.monitor = monitor
|
||||
self.interval = interval_seconds
|
||||
self._running = False
|
||||
self._task: Optional[asyncio.Task] = None
|
||||
|
||||
async def start(self) -> None:
|
||||
"""Starts sending heartbeats"""
|
||||
if self._running:
|
||||
return
|
||||
|
||||
self._running = True
|
||||
self._task = asyncio.create_task(self._heartbeat_loop())
|
||||
logger.debug(f"Heartbeat client started for session {self.session_id}")
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Stops sending heartbeats"""
|
||||
self._running = False
|
||||
if self._task:
|
||||
self._task.cancel()
|
||||
try:
|
||||
await self._task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self._task = None
|
||||
logger.debug(f"Heartbeat client stopped for session {self.session_id}")
|
||||
|
||||
async def _heartbeat_loop(self) -> None:
|
||||
"""Main heartbeat sending loop"""
|
||||
while self._running:
|
||||
try:
|
||||
await self._send_heartbeat()
|
||||
await asyncio.sleep(self.interval)
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending heartbeat: {e}")
|
||||
await asyncio.sleep(self.interval)
|
||||
|
||||
async def _send_heartbeat(self) -> None:
|
||||
"""Sends a single heartbeat"""
|
||||
if self.monitor:
|
||||
# Local monitor
|
||||
self.monitor.beat(self.session_id)
|
||||
# Future: Add HTTP-based heartbeat for distributed agents
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Context manager entry"""
|
||||
await self.start()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
await self.stop()
|
||||
Reference in New Issue
Block a user