diff options
Diffstat (limited to 'cogs/alerts.py')
| -rw-r--r-- | cogs/alerts.py | 208 |
1 files changed, 208 insertions, 0 deletions
"""Background Proxmox monitoring cog that DMs alerts to the bot owner."""

import asyncio
from typing import Any, Dict, Optional, Set

import discord
from discord.ext import commands, tasks
from proxmoxer import ProxmoxAPI

import config


class Alerts(commands.Cog):
    """Background monitoring and DM alerts.

    Once a minute the cog polls the configured Proxmox node and sends the
    owner a DM when:

    * CPU or memory usage crosses ``config.ALERT_CPU_THRESHOLD`` /
      ``config.ALERT_MEMORY_THRESHOLD`` (one alert per excursion),
    * a VM/LXC that was running on the previous poll is no longer running,
    * a ``vzdump`` task with status ``OK`` appears that was not seen before.
    """

    # Bounds for the remembered backup UPIDs: once more than
    # ``_MAX_SEEN_BACKUPS`` are stored, the oldest are dropped until
    # ``_KEEP_SEEN_BACKUPS`` remain (entries still listed are never dropped).
    _MAX_SEEN_BACKUPS = 100
    _KEEP_SEEN_BACKUPS = 50

    def __init__(self, bot: commands.Bot):
        self.bot = bot
        # Track alert states to prevent spam
        self._cpu_alert_active = False
        self._memory_alert_active = False
        # VMIDs that were reported as stopped
        self._stopped_vms: Set[int] = set()
        # VMIDs that were running at the last successful check
        self._last_known_running: Set[int] = set()
        # UPIDs of already-seen backup tasks. A dict is used as an
        # insertion-ordered set so the *oldest* entries can be evicted.
        self._seen_backup_tasks: Dict[str, None] = {}
        # False until one backup poll has succeeded; that first poll only
        # records what already exists instead of alerting about it.
        self._backup_baseline_ready = False

    def get_proxmox(self) -> ProxmoxAPI:
        """Get a Proxmox API connection."""
        return ProxmoxAPI(
            config.PROXMOX_HOST,
            user=config.PROXMOX_USER,
            token_name=config.PROXMOX_TOKEN_NAME,
            token_value=config.PROXMOX_TOKEN_VALUE,
            verify_ssl=config.PROXMOX_VERIFY_SSL,
        )

    async def cog_load(self):
        """Called when cog is loaded."""
        self.monitor_loop.start()

    async def cog_unload(self):
        """Called when cog is unloaded."""
        self.monitor_loop.cancel()

    async def _run_blocking(self, func, *args):
        """Run a blocking callable in the default executor and await it.

        proxmoxer performs synchronous HTTP requests; calling it directly
        from a coroutine would stall the bot's event loop for the duration
        of every request.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, func, *args)

    async def _send_alert(
        self, message: str, embed: Optional[discord.Embed] = None
    ):
        """Send an alert DM to the owner.

        Failures are printed and swallowed so that one undeliverable DM
        cannot abort the rest of the monitoring cycle.
        """
        try:
            # Prefer the cache; only hit the REST API when the owner is not
            # cached.
            owner = self.bot.get_user(config.OWNER_ID)
            if owner is None:
                owner = await self.bot.fetch_user(config.OWNER_ID)
            # An empty string is sent as "no content" so embed-only alerts
            # do not carry a blank message body.
            await owner.send(content=message or None, embed=embed)
        except Exception as e:
            print(f"Failed to send alert: {e}")

    @tasks.loop(seconds=60)
    async def monitor_loop(self):
        """Background task that checks system status."""
        try:
            proxmox = await self._run_blocking(self.get_proxmox)
            await self._check_node_resources(proxmox)
            await self._check_vm_status(proxmox)
            await self._check_backup_tasks(proxmox)
        except Exception as e:
            print(f"Monitor loop error: {e}")

    @monitor_loop.before_loop
    async def before_monitor_loop(self):
        """Wait for bot to be ready before starting loop."""
        await self.bot.wait_until_ready()
        # Initialize known running VMs so the first poll has a baseline.
        try:
            proxmox = await self._run_blocking(self.get_proxmox)
            self._last_known_running = await self._get_running_vmids(proxmox)
        except Exception as e:
            # Best effort: the first successful poll establishes the
            # baseline instead.
            print(f"Initial VM state check failed: {e}")

    async def _list_guests(self, proxmox: ProxmoxAPI) -> Dict[Any, dict]:
        """Return every QEMU VM and LXC on the node, keyed by VMID.

        Raises whatever the API call raises. Callers must not treat a
        failed listing as "nothing is running".
        """
        node = proxmox.nodes(config.PROXMOX_NODE)
        guests: Dict[Any, dict] = {}
        for endpoint in (node.qemu, node.lxc):
            for guest in await self._run_blocking(endpoint.get):
                vmid = guest.get("vmid")
                if vmid is not None:
                    guests[vmid] = guest
        return guests

    @staticmethod
    def _running_ids(guests: Dict[Any, dict]) -> Set[int]:
        """Return the VMIDs whose status is ``"running"``."""
        return {
            vmid
            for vmid, guest in guests.items()
            if guest.get("status") == "running"
        }

    async def _get_running_vmids(self, proxmox: ProxmoxAPI) -> Set[int]:
        """Get set of all running VM/LXC IDs.

        Raises if the node cannot be queried, so a transient API failure is
        not mistaken for every guest having stopped.
        """
        return self._running_ids(await self._list_guests(proxmox))

    async def _check_node_resources(self, proxmox: ProxmoxAPI):
        """Check CPU and memory usage, alert if above threshold.

        Each alert fires once when usage reaches its threshold and is
        re-armed when usage drops below it again.
        """
        try:
            node_status = await self._run_blocking(
                proxmox.nodes(config.PROXMOX_NODE).status.get
            )

            # CPU check ("cpu" is scaled by 100 to get a percentage)
            cpu_percent = node_status["cpu"] * 100
            if cpu_percent >= config.ALERT_CPU_THRESHOLD:
                if not self._cpu_alert_active:
                    self._cpu_alert_active = True
                    embed = discord.Embed(
                        title=":warning: High CPU Usage Alert",
                        description=f"CPU usage on **{config.PROXMOX_NODE}** is at **{cpu_percent:.1f}%**",
                        color=discord.Color.orange(),
                    )
                    await self._send_alert("", embed=embed)
            else:
                self._cpu_alert_active = False

            # Memory check
            mem_used = node_status["memory"]["used"]
            mem_total = node_status["memory"]["total"]
            mem_percent = (mem_used / mem_total) * 100 if mem_total > 0 else 0

            if mem_percent >= config.ALERT_MEMORY_THRESHOLD:
                if not self._memory_alert_active:
                    self._memory_alert_active = True
                    embed = discord.Embed(
                        title=":warning: High Memory Usage Alert",
                        description=f"Memory usage on **{config.PROXMOX_NODE}** is at **{mem_percent:.1f}%**\n"
                        f"({mem_used / (1024**3):.1f} GB / {mem_total / (1024**3):.1f} GB)",
                        color=discord.Color.orange(),
                    )
                    await self._send_alert("", embed=embed)
            else:
                self._memory_alert_active = False

        except Exception as e:
            print(f"Resource check error: {e}")

    async def _check_vm_status(self, proxmox: ProxmoxAPI):
        """Check for VMs/LXCs that have stopped unexpectedly.

        A guest is reported when it was running on the previous successful
        poll and is not running now. If the node cannot be queried the
        check is skipped and the previous state is kept.
        """
        try:
            # One snapshot serves both the running set and the name lookup.
            guests = await self._list_guests(proxmox)
            current_running = self._running_ids(guests)

            # Find VMs that were running but are now stopped
            newly_stopped = self._last_known_running - current_running

            for vmid in newly_stopped:
                if vmid in self._stopped_vms:
                    continue
                self._stopped_vms.add(vmid)

                # Fall back to the ID when the guest is no longer listed
                # or has no name.
                fallback_name = f"ID {vmid}"
                vm_name = guests.get(vmid, {}).get("name", fallback_name)

                embed = discord.Embed(
                    title=":red_circle: VM/Container Stopped",
                    description=f"**{vm_name}** (VMID: {vmid}) has stopped running.",
                    color=discord.Color.red(),
                )
                await self._send_alert("", embed=embed)

            # Clear stopped alerts for VMs that are running again
            self._stopped_vms -= current_running

            # Update last known state
            self._last_known_running = current_running

        except Exception as e:
            print(f"VM status check error: {e}")

    def _prune_seen_backups(self, listed: Set[str]) -> None:
        """Drop the oldest remembered UPIDs once the cap is exceeded.

        UPIDs in ``listed`` (still present in the latest task listing) are
        never dropped, because forgetting them would re-send their alert on
        the next poll.
        """
        if len(self._seen_backup_tasks) <= self._MAX_SEEN_BACKUPS:
            return
        for upid in list(self._seen_backup_tasks):
            if len(self._seen_backup_tasks) <= self._KEEP_SEEN_BACKUPS:
                break
            if upid not in listed:
                del self._seen_backup_tasks[upid]

    async def _check_backup_tasks(self, proxmox: ProxmoxAPI):
        """Check for completed backup tasks.

        Sends one DM per ``vzdump`` task with status ``OK`` that has not
        been seen before. The first successful poll only records the tasks
        that already exist, so a bot restart does not replay the whole
        task history as fresh alerts.
        """
        try:
            task_list = await self._run_blocking(
                proxmox.nodes(config.PROXMOX_NODE).tasks.get
            )

            notify = self._backup_baseline_ready
            listed: Set[str] = set()

            for task in task_list:
                # Look for vzdump (backup) tasks
                if task.get("type") != "vzdump" or task.get("status") != "OK":
                    continue
                upid = task.get("upid")
                if not upid:
                    continue

                listed.add(upid)
                if upid in self._seen_backup_tasks:
                    continue
                self._seen_backup_tasks[upid] = None

                if not notify:
                    continue

                embed = discord.Embed(
                    title=":white_check_mark: Backup Completed",
                    description=f"Backup task completed successfully.\n"
                    f"**Node:** {task.get('node', 'unknown')}\n"
                    f"**Started:** {task.get('starttime', 'unknown')}\n"
                    f"**Ended:** {task.get('endtime', 'unknown')}",
                    color=discord.Color.green(),
                )
                await self._send_alert("", embed=embed)

            self._backup_baseline_ready = True
            # Keep the remembered set from growing too large
            self._prune_seen_backups(listed)

        except Exception as e:
            print(f"Backup check error: {e}")


async def setup(bot: commands.Bot):
    """Extension entry point: register the Alerts cog on the bot."""
    await bot.add_cog(Alerts(bot))
