aboutsummaryrefslogtreecommitdiff
path: root/cogs/alerts.py
blob: 9e2c71c481db1d3672c02e005d203901d5b38daf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import asyncio
from typing import Any, Callable, Dict, Optional, Set

import discord
from discord.ext import commands, tasks
from proxmoxer import ProxmoxAPI

import config


class Alerts(commands.Cog):
    """Background monitoring and DM alerts.

    Every 60 seconds the cog polls the Proxmox node named by
    ``config.PROXMOX_NODE`` and DMs the user ``config.OWNER_ID`` when:

    * CPU usage reaches ``config.ALERT_CPU_THRESHOLD`` percent,
    * memory usage reaches ``config.ALERT_MEMORY_THRESHOLD`` percent,
    * a VM/LXC that was running on the previous poll is no longer running,
    * a ``vzdump`` (backup) task shows up with status ``OK``.

    Each condition is latched so the owner receives one DM per event rather
    than one per poll. All Proxmox requests are executed off the event loop.
    """

    # Once more than this many backup UPIDs are remembered, the ones that are
    # no longer present in the node's task listing are forgotten.
    _SEEN_BACKUP_PRUNE_AT = 100

    def __init__(self, bot: commands.Bot):
        self.bot = bot
        # Track alert states to prevent spam
        self._cpu_alert_active = False
        self._memory_alert_active = False
        self._stopped_vms: Set[int] = set()  # VMIDs that were reported as stopped
        self._last_known_running: Set[int] = set()  # VMIDs that were running last check
        self._seen_backup_tasks: Set[str] = set()  # UPIDs of already-notified backup tasks

    def get_proxmox(self) -> ProxmoxAPI:
        """Get a Proxmox API connection built from the ``config`` module."""
        return ProxmoxAPI(
            config.PROXMOX_HOST,
            user=config.PROXMOX_USER,
            token_name=config.PROXMOX_TOKEN_NAME,
            token_value=config.PROXMOX_TOKEN_VALUE,
            verify_ssl=config.PROXMOX_VERIFY_SSL,
        )

    async def cog_load(self):
        """Called when cog is loaded; starts the monitoring loop."""
        self.monitor_loop.start()

    async def cog_unload(self):
        """Called when cog is unloaded; cancels the monitoring loop."""
        self.monitor_loop.cancel()

    async def _run_blocking(self, func: Callable[[], Any]) -> Any:
        """Run a blocking zero-argument callable in the default executor.

        The proxmoxer calls in this cog are plain synchronous calls; invoking
        them directly from a coroutine would stall the event loop until the
        HTTP request returns. Any exception raised by ``func`` is re-raised
        to the awaiting caller.
        """
        return await asyncio.get_running_loop().run_in_executor(None, func)

    async def _send_alert(self, message: str, embed: Optional[discord.Embed] = None):
        """Send an alert DM to the owner.

        Args:
            message: Text content of the DM. May be empty when ``embed`` is
                given.
            embed: Optional embed to attach.

        Failures are printed and swallowed so that one undeliverable DM does
        not abort the rest of the monitoring pass.
        """
        try:
            # Prefer the cached user; fall back to an API fetch on a cache miss.
            owner = self.bot.get_user(config.OWNER_ID)
            if owner is None:
                owner = await self.bot.fetch_user(config.OWNER_ID)

            # Compare against None explicitly rather than relying on the
            # truthiness of the Embed object.
            if embed is not None:
                await owner.send(content=message or None, embed=embed)
            else:
                await owner.send(message)
        except Exception as e:
            print(f"Failed to send alert: {e}")

    @tasks.loop(seconds=60)
    async def monitor_loop(self):
        """Background task that checks system status."""
        try:
            proxmox = self.get_proxmox()
            await self._check_node_resources(proxmox)
            await self._check_vm_status(proxmox)
            await self._check_backup_tasks(proxmox)
        except Exception as e:
            print(f"Monitor loop error: {e}")

    @monitor_loop.before_loop
    async def before_monitor_loop(self):
        """Wait for bot to be ready, then seed the baseline state.

        Seeding the running-VM set and the already-finished backup tasks
        means the first poll only reports changes that happen after startup
        instead of replaying history. Seeding is best effort: on failure the
        corresponding state simply stays empty.
        """
        await self.bot.wait_until_ready()

        try:
            proxmox = self.get_proxmox()
        except Exception as e:
            print(f"Alert baseline error: {e}")
            return

        # Initialize known running VMs
        try:
            self._last_known_running = await self._get_running_vmids(proxmox)
        except Exception as e:
            print(f"Alert baseline error (VM status): {e}")

        # Remember backups that had already completed before the bot started
        try:
            listing = await self._run_blocking(
                proxmox.nodes(config.PROXMOX_NODE).tasks.get
            )
            self._seen_backup_tasks = {
                entry.get("upid")
                for entry in listing
                if self._is_successful_backup(entry) and entry.get("upid")
            }
        except Exception as e:
            print(f"Alert baseline error (backup tasks): {e}")

    @staticmethod
    def _is_successful_backup(task: dict) -> bool:
        """Return True for a ``vzdump`` task entry whose status is ``OK``."""
        return task.get("type") == "vzdump" and task.get("status") == "OK"

    @staticmethod
    def _running_ids(guests: Dict[int, dict]) -> Set[int]:
        """Return the VMIDs in ``guests`` whose status is ``running``."""
        return {
            vmid for vmid, guest in guests.items() if guest.get("status") == "running"
        }

    async def _fetch_guests(self, proxmox: ProxmoxAPI) -> Dict[int, dict]:
        """Fetch all QEMU VMs and LXC containers on the node, keyed by VMID.

        VMIDs are normalized to ``int`` so that set arithmetic on them is
        consistent regardless of how each listing encodes the value.

        Raises:
            Exception: whatever the underlying API call raises. Errors are
                deliberately not swallowed here: a partial listing would make
                healthy guests look stopped.
        """
        node = proxmox.nodes(config.PROXMOX_NODE)
        guests: Dict[int, dict] = {}
        for list_guests in (node.qemu.get, node.lxc.get):
            for guest in await self._run_blocking(list_guests):
                vmid = guest.get("vmid")
                if vmid is not None:
                    guests[int(vmid)] = guest
        return guests

    async def _get_running_vmids(self, proxmox: ProxmoxAPI) -> Set[int]:
        """Get set of all running VM/LXC IDs.

        Raises:
            Exception: if either guest listing cannot be fetched, so callers
                can skip the cycle instead of acting on incomplete data.
        """
        return self._running_ids(await self._fetch_guests(proxmox))

    async def _check_node_resources(self, proxmox: ProxmoxAPI):
        """Check CPU and memory usage, alert if above threshold.

        An alert is sent once when usage reaches the threshold and is re-armed
        only after usage drops back below it.
        """
        try:
            node_status = await self._run_blocking(
                proxmox.nodes(config.PROXMOX_NODE).status.get
            )

            # CPU check (the code treats node_status["cpu"] as a 0-1 fraction)
            cpu_percent = node_status["cpu"] * 100
            if cpu_percent >= config.ALERT_CPU_THRESHOLD:
                if not self._cpu_alert_active:
                    self._cpu_alert_active = True
                    embed = discord.Embed(
                        title=":warning: High CPU Usage Alert",
                        description=f"CPU usage on **{config.PROXMOX_NODE}** is at **{cpu_percent:.1f}%**",
                        color=discord.Color.orange(),
                    )
                    await self._send_alert("", embed=embed)
            else:
                self._cpu_alert_active = False

            # Memory check
            mem_used = node_status["memory"]["used"]
            mem_total = node_status["memory"]["total"]
            # Guard against a zero total to avoid ZeroDivisionError.
            mem_percent = (mem_used / mem_total) * 100 if mem_total > 0 else 0

            if mem_percent >= config.ALERT_MEMORY_THRESHOLD:
                if not self._memory_alert_active:
                    self._memory_alert_active = True
                    embed = discord.Embed(
                        title=":warning: High Memory Usage Alert",
                        description=f"Memory usage on **{config.PROXMOX_NODE}** is at **{mem_percent:.1f}%**\n"
                        f"({mem_used / (1024**3):.1f} GB / {mem_total / (1024**3):.1f} GB)",
                        color=discord.Color.orange(),
                    )
                    await self._send_alert("", embed=embed)
            else:
                self._memory_alert_active = False

        except Exception as e:
            print(f"Resource check error: {e}")

    async def _check_vm_status(self, proxmox: ProxmoxAPI):
        """Check for VMs/LXCs that have stopped unexpectedly.

        Compares the currently running VMIDs with those running on the
        previous poll and DMs the owner once per guest that dropped out. If
        the guest listing cannot be fetched, the cycle is skipped and the
        previous state is kept, so an API outage is not reported as every
        guest having stopped.
        """
        try:
            # One listing per poll serves both the status and the name lookup.
            guests = await self._fetch_guests(proxmox)
            current_running = self._running_ids(guests)

            # Find VMs that were running but are now stopped
            newly_stopped = self._last_known_running - current_running

            for vmid in sorted(newly_stopped):
                if vmid in self._stopped_vms:
                    continue
                self._stopped_vms.add(vmid)

                # Fall back to the bare ID when the guest is no longer listed
                # or has no name.
                vm_name = guests.get(vmid, {}).get("name", f"ID {vmid}")

                embed = discord.Embed(
                    title=":red_circle: VM/Container Stopped",
                    description=f"**{vm_name}** (VMID: {vmid}) has stopped running.",
                    color=discord.Color.red(),
                )
                await self._send_alert("", embed=embed)

            # Clear stopped alerts for VMs that are running again
            self._stopped_vms = self._stopped_vms - current_running

            # Update last known state
            self._last_known_running = current_running

        except Exception as e:
            print(f"VM status check error: {e}")

    async def _check_backup_tasks(self, proxmox: ProxmoxAPI):
        """Check for completed backup tasks.

        Sends one DM per ``vzdump`` task with status ``OK`` whose UPID has not
        been reported before.
        """
        try:
            task_list = await self._run_blocking(
                proxmox.nodes(config.PROXMOX_NODE).tasks.get
            )

            listed_upids: Set[str] = set()
            for task in task_list:
                upid = task.get("upid")
                if not upid:
                    continue
                listed_upids.add(upid)

                # Look for vzdump (backup) tasks
                if not self._is_successful_backup(task):
                    continue
                if upid in self._seen_backup_tasks:
                    continue
                self._seen_backup_tasks.add(upid)

                embed = discord.Embed(
                    title=":white_check_mark: Backup Completed",
                    description=f"Backup task completed successfully.\n"
                    f"**Node:** {task.get('node', 'unknown')}\n"
                    f"**Started:** {task.get('starttime', 'unknown')}\n"
                    f"**Ended:** {task.get('endtime', 'unknown')}",
                    color=discord.Color.green(),
                )
                await self._send_alert("", embed=embed)

            # Keep set from growing too large. Only UPIDs absent from the
            # current listing are dropped: those cannot trigger an alert on
            # this pass, whereas dropping a still-listed UPID would cause a
            # duplicate DM on the next poll.
            if len(self._seen_backup_tasks) > self._SEEN_BACKUP_PRUNE_AT:
                self._seen_backup_tasks &= listed_upids

        except Exception as e:
            print(f"Backup check error: {e}")


async def setup(bot: commands.Bot):
    """Create an Alerts cog bound to ``bot`` and add it to the bot.

    Presumably invoked by discord.py's extension loader - confirm against
    the code that loads this module.
    """
    alerts_cog = Alerts(bot)
    await bot.add_cog(alerts_cog)