# File: unpoller-unpoller-3/alerts/prometheus/unifi-alerts.yaml (YAML; listed as 235 lines, 8.6 KiB)

# Prometheus alerting rules for UniFi infrastructure (unPoller metrics)
# Default namespace: unpoller. Adjust metric names if using a custom namespace.
groups:
- name: unifi-ups
  # UPS battery, runtime, load, BMS and charging alerts.
  rules:
  # Battery level below 20% for 5 minutes.
  - alert: UnifiUPSLowBattery
    expr: unpoller_device_ups_battery_level_percent < 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Low UPS battery on {{ $labels.device_name }}"
      description: "UPS {{ $labels.device_name }} at {{ $value }}% battery (site: {{ $labels.site_name }})"

  # Battery level below 10% for 2 minutes. This overlaps the warning rule
  # above (a value below 10 is also below 20), so both alerts can be active.
  - alert: UnifiUPSCriticalBattery
    expr: unpoller_device_ups_battery_level_percent < 10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical UPS battery on {{ $labels.device_name }}"
      description: "UPS {{ $labels.device_name }} at {{ $value }}% battery - prepare for shutdown"

  # Battery-mode gauge equal to 1 for 1 minute.
  - alert: UnifiUPSOnBattery
    expr: unpoller_device_ups_battery_mode == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "UPS {{ $labels.device_name }} running on battery"
      description: "Power outage or AC loss - UPS {{ $labels.device_name }} on battery (site: {{ $labels.site_name }})"

  # Remaining runtime in the range [0, 300) seconds for 5 minutes.
  # NOTE(review): the `>= 0` guard presumably filters out a negative
  # "unknown" sentinel reported by the device - confirm against the exporter.
  - alert: UnifiUPSLowRuntime
    expr: >-
      unpoller_device_ups_battery_time_remaining_seconds < 300
      and unpoller_device_ups_battery_time_remaining_seconds >= 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Low UPS runtime on {{ $labels.device_name }}"
      description: "UPS {{ $labels.device_name }} has {{ $value | humanizeDuration }} runtime remaining"

  # Load above 80% for 10 minutes.
  - alert: UnifiUPSHighLoad
    expr: unpoller_device_ups_load_percent > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High UPS load on {{ $labels.device_name }}"
      description: "UPS {{ $labels.device_name }} load at {{ $value }}% of capacity"

  # BMS anomaly counter above zero for 5 minutes.
  - alert: UnifiUPSBMSAnomaly
    expr: unpoller_device_ups_bms_anomaly_count > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "UPS BMS anomaly on {{ $labels.device_name }}"
      description: "Battery management anomaly detected on {{ $labels.device_name }}"

  # Battery below 100% while the charging gauge is 0, for 30 minutes.
  # The battery-level series is deliberately the LEFT operand of `and`:
  # $value is taken from the left-hand side, and the description prints it
  # as the battery percentage. With the charging gauge on the left, $value
  # would always be 0. The set of firing series is the same either way.
  - alert: UnifiUPSNotCharging
    expr: >-
      unpoller_device_ups_battery_level_percent < 100
      and unpoller_device_ups_battery_charging == 0
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "UPS {{ $labels.device_name }} not charging"
      description: "Battery at {{ $value }}% but not charging - check power or battery health"
- name: unifi-controller
  # Informational alerts about the controller software and its inventory.
  rules:
  # Update-available gauge equal to 1 for a full hour.
  - alert: UnifiControllerUpdateAvailable
    expr: unpoller_controller_update_available == 1
    for: 1h
    labels:
      severity: info
    annotations:
      summary: "UniFi controller update available ({{ $labels.hostname }})"
      description: "Controller {{ $labels.hostname }} has an update available"

  # At least one unsupported device reported for a full hour; $value is
  # the reported count.
  - alert: UnifiControllerUnsupportedDevices
    expr: unpoller_controller_unsupported_device_count > 0
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Unsupported devices on controller {{ $labels.hostname }}"
      description: "{{ $value }} unsupported device(s) on controller"
- name: unifi-devices
  # Per-device resource and firmware alerts.
  rules:
  # CPU utilization ratio above 0.9 for 10 minutes.
  # The value is rendered with humanizePercentage because the metric is a
  # ratio: plain `humanize` applies SI prefixes to values below 1 and would
  # print e.g. "950m" instead of "95%".
  - alert: UnifiDeviceHighCPU
    expr: unpoller_device_cpu_utilization_ratio > 0.9
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU on {{ $labels.name }}"
      description: "Device {{ $labels.name }} ({{ $labels.type }}) CPU at {{ $value | humanizePercentage }}"

  # Memory utilization ratio above 0.9 for 10 minutes; formatted as a
  # percentage for the same reason as the CPU rule.
  - alert: UnifiDeviceHighMemory
    expr: unpoller_device_memory_utilization_ratio > 0.9
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High memory on {{ $labels.name }}"
      description: "Device {{ $labels.name }} memory at {{ $value | humanizePercentage }}"

  # Upgradable gauge equal to 1 for a full hour.
  - alert: UnifiDeviceUpgradeAvailable
    expr: unpoller_device_upgradable == 1
    for: 1h
    labels:
      severity: info
    annotations:
      summary: "Device {{ $labels.name }} has upgrade available"
      description: "{{ $labels.type }} device {{ $labels.name }} has firmware update available (site: {{ $labels.site_name }})"
- name: unifi-site
  # Site-level alerts, evaluated per subsystem where the metric has one.
  rules:
  # Disconnected-device count above zero in the wlan, wan or lan subsystem
  # for 15 minutes.
  - alert: UnifiSiteHighDisconnectedDevices
    expr: unpoller_site_disconnected{subsystem=~"wlan|wan|lan"} > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "Disconnected devices on {{ $labels.site_name }} ({{ $labels.subsystem }})"
      description: "{{ $value }} device(s) disconnected in {{ $labels.subsystem }} subsystem"

  # Pending-adoption count above zero for a full hour.
  - alert: UnifiSitePendingAdoptions
    expr: unpoller_site_pending > 0
    for: 1h
    labels:
      severity: info
    annotations:
      summary: "Pending device adoptions on {{ $labels.site_name }}"
      description: "{{ $value }} device(s) pending adoption (subsystem: {{ $labels.subsystem }})"

  # Drop counter increased over the trailing hour, sustained for 5 minutes.
  # NOTE(review): "intenet" looks like a typo, but it appears to be the
  # spelling the exporter actually emits - verify against live metrics
  # before renaming, otherwise this rule will silently never fire.
  - alert: UnifiSiteWANDrops
    expr: increase(unpoller_site_intenet_drops_total[1h]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "WAN disconnections on {{ $labels.site_name }}"
      description: "WAN dropped {{ $value }} time(s) in the last hour"

  # Latency of the "www" subsystem above 0.5 seconds for 10 minutes.
  - alert: UnifiSiteHighLatency
    expr: unpoller_site_latency_seconds{subsystem="www"} > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High internet latency on {{ $labels.site_name }}"
      description: "Latency {{ $value }}s exceeds 500ms threshold"
- name: unifi-wan
  # Per-WAN-link availability and capacity alerts.
  rules:
  # Uptime percentage below 95 for 15 minutes.
  - alert: UnifiWANLowUptime
    expr: unpoller_wan_uptime_percentage < 95
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "Low WAN uptime on {{ $labels.wan_name }}"
      description: "WAN {{ $labels.wan_name }} uptime at {{ $value }}% (site: {{ $labels.site_name }})"

  # Peak download percentage above 90 for 10 minutes.
  - alert: UnifiWANPeakDownloadUtilization
    expr: unpoller_wan_peak_download_percent > 90
    for: 10m
    labels:
      severity: info
    annotations:
      summary: "WAN download near capacity on {{ $labels.wan_name }}"
      description: "Peak download at {{ $value }}% - consider upgrading or load balancing"

  # Peak upload percentage above 90 for 10 minutes.
  - alert: UnifiWANPeakUploadUtilization
    expr: unpoller_wan_peak_upload_percent > 90
    for: 10m
    labels:
      severity: info
    annotations:
      summary: "WAN upload near capacity on {{ $labels.wan_name }}"
      description: "Peak upload at {{ $value }}% - consider upgrading or load balancing"
- name: unifi-dhcp
  # DHCP pool utilization alerts at two thresholds.
  rules:
  # Utilization above 90% for 15 minutes.
  - alert: UnifiDHCPPoolExhaustion
    expr: unpoller_dhcp_utilization_percent > 90
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "DHCP pool nearly exhausted on {{ $labels.network }}"
      description: "DHCP utilization at {{ $value }}% - expand pool or reduce lease time"

  # Utilization above 98% for 5 minutes. A value above 98 also satisfies
  # the warning rule above, so both alerts can be active simultaneously.
  - alert: UnifiDHCPPoolCritical
    expr: unpoller_dhcp_utilization_percent > 98
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "DHCP pool critically low on {{ $labels.network }}"
      description: "Utilization at {{ $value }}% - new devices may not get IP addresses"
- name: unifi-rogue
  # Rogue access point detection.
  rules:
  # Fires when at least one rogue-AP RSSI series exists for a site for
  # 5 minutes. The aggregation keeps `site_name` because a bare count()
  # discards every label, which would leave {{ $labels.site_name }} in the
  # summary empty. Grouping also yields one alert per site rather than a
  # single global alert.
  # NOTE(review): assumes the rogue-AP series carry a `site_name` label, as
  # the summary template already expects - confirm against the exporter.
  - alert: UnifiRogueAPDetected
    expr: count by (site_name) (unpoller_rogueap_rssi) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Rogue AP detected on {{ $labels.site_name }}"
      description: "Unauthorized access point(s) detected - review and take action"
- name: unifi-controller-health
  # Controller uptime and backup-configuration alerts.
  rules:
  # Controller uptime under 3600 seconds (one hour), sustained for 5 minutes.
  - alert: UnifiControllerRecentlyRestarted
    expr: unpoller_controller_uptime_seconds < 3600
    for: 5m
    labels:
      severity: info
    annotations:
      summary: "Controller {{ $labels.hostname }} recently restarted"
      description: "Uptime {{ $value | humanizeDuration }} - may indicate maintenance or crash"

  # Auto-backup gauge equal to 0 for a continuous 24 hours.
  - alert: UnifiControllerBackupDisabled
    expr: unpoller_controller_autobackup_enabled == 0
    for: 24h
    labels:
      severity: info
    annotations:
      summary: "Auto backup disabled on {{ $labels.hostname }}"
      description: "Controller has automatic backups disabled - enable for disaster recovery"