From 5ea7fcf736b642f365067c48325431a47e7ad5a7 Mon Sep 17 00:00:00 2001 From: Brian Gates Date: Sat, 31 Jan 2026 21:25:58 -0500 Subject: [PATCH] feat: UPS battery metrics, example Prometheus/Loki alerts (unpoller#930) (#941) --- alerts/README.md | 210 +++++++++++++++++ alerts/loki/unifi-alerts.yaml | 92 ++++++++ alerts/prometheus/unifi-alerts.yaml | 234 +++++++++++++++++++ alerts/prometheus/unifi-recording-rules.yaml | 60 +++++ go.mod | 2 +- go.sum | 4 +- pkg/influxunifi/udm.go | 2 +- pkg/promunifi/pdu.go | 71 ++++++ 8 files changed, 671 insertions(+), 4 deletions(-) create mode 100644 alerts/README.md create mode 100644 alerts/loki/unifi-alerts.yaml create mode 100644 alerts/prometheus/unifi-alerts.yaml create mode 100644 alerts/prometheus/unifi-recording-rules.yaml diff --git a/alerts/README.md b/alerts/README.md new file mode 100644 index 00000000..1f314127 --- /dev/null +++ b/alerts/README.md @@ -0,0 +1,210 @@ +# UniFi Infrastructure Alerts + +Example Prometheus and Loki alerting rules for monitoring UniFi infrastructure with unPoller. + +## Overview + +- **Prometheus** – Metrics from devices, clients, UPS/PDU, controller, sites, WAN, DHCP, rogue APs, and more +- **Loki** – Logs for events, alarms, IDS, anomalies, and system logs + +These examples assume the default Prometheus namespace `unpoller`. Adjust metric names if you use a custom `prometheus.namespace`. + +--- + +## Prometheus Alerts + +Place `prometheus/unifi-alerts.yaml` in your Prometheus `rule_files` or Grafana Alerting. 
+ +### UPS (unifi-ups) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiUPSLowBattery** | Battery level < 20% for 5m | warning | UPS needs attention; plan for charging or replacement | +| **UnifiUPSCriticalBattery** | Battery level < 10% for 2m | critical | UPS near depletion; prepare for shutdown | +| **UnifiUPSOnBattery** | Running on battery for 1m | warning | Power outage or AC loss; UPS sustaining load | +| **UnifiUPSLowRuntime** | Runtime < 5 min (and known) for 5m | warning | Little runtime left; prioritize critical loads | +| **UnifiUPSHighLoad** | Load > 80% of capacity for 10m | warning | UPS near capacity; consider load shedding | +| **UnifiUPSBMSAnomaly** | BMS anomaly count > 0 for 5m | warning | Battery management system issue; check UPS health | +| **UnifiUPSNotCharging** | Not charging and battery < 100% for 30m | warning | Battery not charging; check power or battery | + +*Requires: PDU/UPS devices with vbms_table (e.g. 
USW-DA-23-POE-UPS)* + +### Controller (unifi-controller) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiControllerUpdateAvailable** | Update available for 1h | info | Controller firmware update available | +| **UnifiControllerUnsupportedDevices** | Unsupported device count > 0 for 1h | warning | Devices no longer supported; plan upgrades | + +### Controller Health (unifi-controller-health) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiControllerRecentlyRestarted** | Uptime < 1h for 5m | info | Controller recently restarted; may indicate maintenance or crash | +| **UnifiControllerBackupDisabled** | Auto backup disabled for 24h | info | Backups disabled; enable for disaster recovery | + +### Devices (unifi-devices) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiDeviceHighCPU** | CPU > 90% for 10m | warning | Device under heavy load; investigate | +| **UnifiDeviceHighMemory** | Memory > 90% for 10m | warning | Device memory pressure; may impact performance | +| **UnifiDeviceUpgradeAvailable** | Firmware upgrade available for 1h | info | Device has firmware update available | + +### Site (unifi-site) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiSiteHighDisconnectedDevices** | Disconnected devices > 0 (WLAN/WAN/LAN) for 15m | warning | Devices offline; check power, connectivity, adoption | +| **UnifiSitePendingAdoptions** | Pending adoptions > 0 for 1h | info | Devices awaiting adoption | +| **UnifiSiteWANDrops** | WAN disconnections in last 1h > 0 | warning | Internet connectivity issues | +| **UnifiSiteHighLatency** | Internet latency > 500ms for 10m | warning | Poor internet performance | + +*Requires: save_sites=true* + +### WAN (unifi-wan) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| 
**UnifiWANLowUptime** | WAN uptime < 95% for 15m | warning | WAN link unstable; check ISP or cabling | +| **UnifiWANPeakDownloadUtilization** | Peak download > 90% of capacity for 10m | info | Download near capacity; consider upgrade | +| **UnifiWANPeakUploadUtilization** | Peak upload > 90% of capacity for 10m | info | Upload near capacity; consider upgrade | + +*Requires: WAN metrics (UDM/UDM-Pro/UCG)* + +### DHCP (unifi-dhcp) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiDHCPPoolExhaustion** | Pool utilization > 90% for 15m | warning | DHCP pool nearly full; expand range or reduce lease time | +| **UnifiDHCPPoolCritical** | Pool utilization > 98% for 5m | critical | Pool almost exhausted; new devices may not get IPs | + +*Requires: save_dhcp or DHCP lease collection* + +### Rogue AP (unifi-rogue) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiRogueAPDetected** | Any rogue AP detected for 5m | warning | Unauthorized access point; investigate and remediate | + +*Requires: save_rogue=true* + +--- + +## Prometheus Recording Rules + +Place `prometheus/unifi-recording-rules.yaml` in your Prometheus `rule_files` to pre-compute aggregates for faster dashboards and simpler alerting. 
+ +### UPS Recording Rules (interval: 1m) + +| Recorded Metric | Expression | Description | +|-----------------|------------|-------------| +| `unpoller:ups_on_battery:count` | Count of UPSes with battery_mode=1 by site | UPS devices running on battery per site | +| `unpoller:ups_min_battery_level_percent:min` | Min battery level by site | Worst battery level per site | +| `unpoller:ups_min_runtime_seconds:min` | Min runtime (≥0) by site | Worst runtime remaining per site | +| `unpoller:ups_total_power_output_watts:sum` | Sum of power output by site | Total UPS load per site | +| `unpoller:ups_total_power_budget_watts:sum` | Sum of power budget by site | Total UPS capacity per site | +| `unpoller:ups_bms_anomaly_count:sum` | Sum of devices with BMS anomaly by site | UPSes with BMS issues per site | + +### Device Recording Rules (interval: 1m) + +| Recorded Metric | Expression | Description | +|-----------------|------------|-------------| +| `unpoller:device_count:by_type` | Count of devices by type (uap, usw, pdu, etc.) per site | Device inventory by type | +| `unpoller:device_count:total` | Total device count per site | Total devices per site | +| `unpoller:device_high_cpu_count:count` | Count of devices with CPU > 90% per site | Overloaded devices per site | +| `unpoller:device_high_memory_count:count` | Count of devices with memory > 90% per site | Memory-pressure devices per site | + +### Controller Recording Rules (interval: 5m) + +| Recorded Metric | Expression | Description | +|-----------------|------------|-------------| +| `unpoller:controller_update_available:count` | Count of controllers with update available | Controllers needing updates | +| `unpoller:controller_unsupported_devices_total:sum` | Sum of unsupported devices | Total unsupported devices across controllers | + +--- + +## Loki Alerts + +Place `loki/unifi-alerts.yaml` in your Loki Ruler config. Loki must be run with `-ruler.enable=true` and `-ruler.storage.path` configured. 
+ +### Alarms (unifi-alarms) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiHighAlarmRate** | > 20 alarms in 15m for 5m | warning | Elevated alarm volume; review controller | + +*Requires: save_alarms=true* + +### IDS (unifi-ids) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiIDSEvent** | Any IDS event in 5m for 1m | warning | Intrusion detection triggered; review logs | +| **UnifiIDSHighVolume** | > 50 IDS events in 1h for 5m | critical | High IDS volume; possible attack | + +*Requires: save_ids=true* + +### Anomalies (unifi-anomalies) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiAnomalyDetected** | > 5 anomalies in 10m for 5m | warning | Multiple anomalies; check network health | + +*Requires: save_anomalies=true* + +### System Log (unifi-system-log) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiSystemLogCritical** | Any CRITICAL log in 5m for 1m | critical | Critical system log; immediate attention | +| **UnifiSystemLogHighSeverity** | > 10 CRITICAL/HIGH/ERROR logs in 15m for 5m | warning | High volume of severe logs | +| **UnifiSystemLogAuthFailure** | > 5 auth failure matches in 1h for 5m | warning | Authentication failures; possible brute force | + +*Requires: save_syslog=true (UDM/UDM-Pro) or save_events=true (older controllers)* + +### Events (unifi-events) + +| Alert | Trigger | Severity | Description | +|-------|---------|----------|-------------| +| **UnifiEventSpike** | > 100 events in 5m for 5m | info | Event spike; may indicate churn or issue | + +*Requires: save_events=true* + +--- + +## Configuration + +**Prometheus (prometheus.yml):** + +```yaml +rule_files: + - /etc/prometheus/rules/unifi-alerts.yaml + - /etc/prometheus/rules/unifi-recording-rules.yaml +``` + +**Loki (loki-config.yaml):** + +```yaml +ruler: + enable_api: 
true + storage: + type: local + local: + directory: /loki/rules + rule_path: /loki/rules-temp + alertmanager_url: http://alertmanager:9093 +``` + +Mount the `loki/` directory into your Loki container under `/loki/rules/`. Loki's local rule storage expects one subdirectory per tenant, so in single-tenant mode (auth disabled) the rule file must end up in `/loki/rules/fake/`. + +## Alertmanager Integration + +Both Prometheus and Loki can forward alerts to Alertmanager. Configure Alertmanager receivers (Slack, PagerDuty, email, etc.) as needed. + +## Customization + +- Tune thresholds (battery %, runtime seconds, CPU %, etc.) for your environment +- Add or remove entries under `labels` and `annotations` to suit your routing and notification channels +- Adjust `for` durations to reduce noise or catch issues sooner +- Disable alert groups that don't apply (e.g. remove UPS alerts if you have no UPS devices) diff --git a/alerts/loki/unifi-alerts.yaml b/alerts/loki/unifi-alerts.yaml new file mode 100644 index 00000000..dba6c12c --- /dev/null +++ b/alerts/loki/unifi-alerts.yaml @@ -0,0 +1,92 @@ +# Loki alerting rules for UniFi logs (unPoller Loki output) +# Requires: save_alarms, save_ids, save_anomalies, save_events, and/or save_syslog enabled +groups: + - name: unifi-alarms + rules: + - alert: UnifiHighAlarmRate + expr: | + sum by (site_name, source) (count_over_time({application="unifi_alarm"}[15m])) > 20 + for: 5m + labels: + severity: warning + annotations: + summary: "High UniFi alarm rate" + description: "{{ $value }} alarms in 15m for site {{ $labels.site_name }}" + + - name: unifi-ids + rules: + - alert: UnifiIDSEvent + expr: | + count_over_time({application="unifi_ids"}[5m]) > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "UniFi IDS event detected" + description: "Intrusion detection event(s) from UniFi - review logs" + + - alert: UnifiIDSHighVolume + expr: | + sum by (site_name) (count_over_time({application="unifi_ids"}[1h])) > 50 + for: 5m + labels: + severity: critical + annotations: + summary: "High UniFi IDS event volume" + description: "{{ $value }} IDS events in 1h for site {{ $labels.site_name }} - possible attack" + + - 
name: unifi-anomalies + rules: + - alert: UnifiAnomalyDetected + expr: | + count_over_time({application="unifi_anomaly"}[10m]) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "UniFi anomalies detected" + description: "Multiple anomalies in 10m - review network health" + + - name: unifi-system-log + rules: + - alert: UnifiSystemLogCritical + expr: | + sum by (site_name, source) (count_over_time({application="unifi_system_log", severity="CRITICAL"}[5m])) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "UniFi CRITICAL system log on {{ $labels.site_name }}" + description: "{{ $value }} CRITICAL log(s) in 5m - immediate attention required" + + - alert: UnifiSystemLogHighSeverity + expr: | + sum by (site_name, severity) (count_over_time({application="unifi_system_log", severity=~"CRITICAL|HIGH|ERROR"}[15m])) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "High-severity UniFi system log volume" + description: "{{ $value }} {{ $labels.severity }} logs in 15m for site {{ $labels.site_name }}" + + - alert: UnifiSystemLogAuthFailure + expr: | + sum by (site_name) (count_over_time({application="unifi_system_log"} |~ "(?i)(login failed|auth failed|authentication failure)"[1h])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "UniFi authentication failures on {{ $labels.site_name }}" + description: "{{ $value }} auth failure(s) in 1h - possible brute force or credential leak" + + - name: unifi-events + rules: + - alert: UnifiEventSpike + expr: | + sum by (site_name) (count_over_time({application="unifi_event"}[5m])) > 100 + for: 5m + labels: + severity: info + annotations: + summary: "UniFi event spike" + description: "{{ $value }} events in 5m - may indicate network churn or issue" diff --git a/alerts/prometheus/unifi-alerts.yaml b/alerts/prometheus/unifi-alerts.yaml new file mode 100644 index 00000000..f088bc41 --- /dev/null +++ b/alerts/prometheus/unifi-alerts.yaml @@ -0,0 +1,234 @@ +# 
Prometheus alerting rules for UniFi infrastructure (unPoller metrics) +# Default namespace: unpoller. Adjust metric names if using a custom namespace. +groups: + - name: unifi-ups + rules: + - alert: UnifiUPSLowBattery + expr: unpoller_device_ups_battery_level_percent < 20 + for: 5m + labels: + severity: warning + annotations: + summary: "Low UPS battery on {{ $labels.device_name }}" + description: "UPS {{ $labels.device_name }} at {{ $value }}% battery (site: {{ $labels.site_name }})" + + - alert: UnifiUPSCriticalBattery + expr: unpoller_device_ups_battery_level_percent < 10 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical UPS battery on {{ $labels.device_name }}" + description: "UPS {{ $labels.device_name }} at {{ $value }}% battery - prepare for shutdown" + + - alert: UnifiUPSOnBattery + expr: unpoller_device_ups_battery_mode == 1 + for: 1m + labels: + severity: warning + annotations: + summary: "UPS {{ $labels.device_name }} running on battery" + description: "Power outage or AC loss - UPS {{ $labels.device_name }} on battery (site: {{ $labels.site_name }})" + + - alert: UnifiUPSLowRuntime + expr: unpoller_device_ups_battery_time_remaining_seconds < 300 and unpoller_device_ups_battery_time_remaining_seconds >= 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Low UPS runtime on {{ $labels.device_name }}" + description: "UPS {{ $labels.device_name }} has {{ $value | humanizeDuration }} runtime remaining" + + - alert: UnifiUPSHighLoad + expr: unpoller_device_ups_load_percent > 80 + for: 10m + labels: + severity: warning + annotations: + summary: "High UPS load on {{ $labels.device_name }}" + description: "UPS {{ $labels.device_name }} load at {{ $value }}% of capacity" + + - alert: UnifiUPSBMSAnomaly + expr: unpoller_device_ups_bms_anomaly_count > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "UPS BMS anomaly on {{ $labels.device_name }}" + description: "Battery management anomaly detected on {{ 
$labels.device_name }}" + + - alert: UnifiUPSNotCharging + expr: unpoller_device_ups_battery_level_percent < 100 and unpoller_device_ups_battery_charging == 0 + for: 30m + labels: + severity: warning + annotations: + summary: "UPS {{ $labels.device_name }} not charging" + description: "Battery at {{ $value }}% but not charging - check power or battery health" + + - name: unifi-controller + rules: + - alert: UnifiControllerUpdateAvailable + expr: unpoller_controller_update_available == 1 + for: 1h + labels: + severity: info + annotations: + summary: "UniFi controller update available ({{ $labels.hostname }})" + description: "Controller {{ $labels.hostname }} has an update available" + + - alert: UnifiControllerUnsupportedDevices + expr: unpoller_controller_unsupported_device_count > 0 + for: 1h + labels: + severity: warning + annotations: + summary: "Unsupported devices on controller {{ $labels.hostname }}" + description: "{{ $value }} unsupported device(s) on controller" + + - name: unifi-devices + rules: + - alert: UnifiDeviceHighCPU + expr: unpoller_device_cpu_utilization_ratio > 0.9 + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU on {{ $labels.name }}" + description: "Device {{ $labels.name }} ({{ $labels.type }}) CPU at {{ $value | humanizePercentage }}" + + - alert: UnifiDeviceHighMemory + expr: unpoller_device_memory_utilization_ratio > 0.9 + for: 10m + labels: + severity: warning + annotations: + summary: "High memory on {{ $labels.name }}" + description: "Device {{ $labels.name }} memory at {{ $value | humanizePercentage }}" + + - alert: UnifiDeviceUpgradeAvailable + expr: unpoller_device_upgradable == 1 + for: 1h + labels: + severity: info + annotations: + summary: "Device {{ $labels.name }} has upgrade available" + description: "{{ $labels.type }} device {{ $labels.name }} has firmware update available (site: {{ $labels.site_name }})" + + - name: unifi-site + rules: + - alert: UnifiSiteHighDisconnectedDevices + expr: 
unpoller_site_disconnected{subsystem=~"wlan|wan|lan"} > 0 + for: 15m + labels: + severity: warning + annotations: + summary: "Disconnected devices on {{ $labels.site_name }} ({{ $labels.subsystem }})" + description: "{{ $value }} device(s) disconnected in {{ $labels.subsystem }} subsystem" + + - alert: UnifiSitePendingAdoptions + expr: unpoller_site_pending > 0 + for: 1h + labels: + severity: info + annotations: + summary: "Pending device adoptions on {{ $labels.site_name }}" + description: "{{ $value }} device(s) pending adoption (subsystem: {{ $labels.subsystem }})" + + - alert: UnifiSiteWANDrops + expr: increase(unpoller_site_intenet_drops_total[1h]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "WAN disconnections on {{ $labels.site_name }}" + description: "WAN dropped {{ $value }} time(s) in the last hour" + + - alert: UnifiSiteHighLatency + expr: unpoller_site_latency_seconds{subsystem="www"} > 0.5 + for: 10m + labels: + severity: warning + annotations: + summary: "High internet latency on {{ $labels.site_name }}" + description: "Latency {{ $value }}s exceeds 500ms threshold" + + - name: unifi-wan + rules: + - alert: UnifiWANLowUptime + expr: unpoller_wan_uptime_percentage < 95 + for: 15m + labels: + severity: warning + annotations: + summary: "Low WAN uptime on {{ $labels.wan_name }}" + description: "WAN {{ $labels.wan_name }} uptime at {{ $value }}% (site: {{ $labels.site_name }})" + + - alert: UnifiWANPeakDownloadUtilization + expr: unpoller_wan_peak_download_percent > 90 + for: 10m + labels: + severity: info + annotations: + summary: "WAN download near capacity on {{ $labels.wan_name }}" + description: "Peak download at {{ $value }}% - consider upgrading or load balancing" + + - alert: UnifiWANPeakUploadUtilization + expr: unpoller_wan_peak_upload_percent > 90 + for: 10m + labels: + severity: info + annotations: + summary: "WAN upload near capacity on {{ $labels.wan_name }}" + description: "Peak upload at {{ $value }}% - consider 
upgrading or load balancing" + + - name: unifi-dhcp + rules: + - alert: UnifiDHCPPoolExhaustion + expr: unpoller_dhcp_utilization_percent > 90 + for: 15m + labels: + severity: warning + annotations: + summary: "DHCP pool nearly exhausted on {{ $labels.network }}" + description: "DHCP utilization at {{ $value }}% - expand pool or reduce lease time" + + - alert: UnifiDHCPPoolCritical + expr: unpoller_dhcp_utilization_percent > 98 + for: 5m + labels: + severity: critical + annotations: + summary: "DHCP pool critically low on {{ $labels.network }}" + description: "Utilization at {{ $value }}% - new devices may not get IP addresses" + + - name: unifi-rogue + rules: + - alert: UnifiRogueAPDetected + expr: count by (site_name) (unpoller_rogueap_rssi) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Rogue AP detected on {{ $labels.site_name }}" + description: "Unauthorized access point(s) detected - review and take action" + + - name: unifi-controller-health + rules: + - alert: UnifiControllerRecentlyRestarted + expr: unpoller_controller_uptime_seconds < 3600 + for: 5m + labels: + severity: info + annotations: + summary: "Controller {{ $labels.hostname }} recently restarted" + description: "Uptime {{ $value | humanizeDuration }} - may indicate maintenance or crash" + + - alert: UnifiControllerBackupDisabled + expr: unpoller_controller_autobackup_enabled == 0 + for: 24h + labels: + severity: info + annotations: + summary: "Auto backup disabled on {{ $labels.hostname }}" + description: "Controller has automatic backups disabled - enable for disaster recovery" diff --git a/alerts/prometheus/unifi-recording-rules.yaml b/alerts/prometheus/unifi-recording-rules.yaml new file mode 100644 index 00000000..f633db6b --- /dev/null +++ b/alerts/prometheus/unifi-recording-rules.yaml @@ -0,0 +1,60 @@ +# Prometheus recording rules for UniFi infrastructure (unPoller metrics) +# Pre-compute aggregates for dashboards and simpler alerting. +# Default namespace: unpoller. 
Adjust metric names if using a custom namespace. +groups: + - name: unifi-ups-recording + interval: 1m + rules: + # Count of UPS devices running on battery per site (for "any UPS on battery" alerts/dashboards) + - record: unpoller:ups_on_battery:count + expr: count by (site_name, source) (unpoller_device_ups_battery_mode == 1) + + # Minimum battery level across UPSes per site (worst-case for multi-UPS sites) + - record: unpoller:ups_min_battery_level_percent:min + expr: min by (site_name, source) (unpoller_device_ups_battery_level_percent) + + # Minimum runtime remaining across UPSes per site (worst-case; excludes -1 unknown) + - record: unpoller:ups_min_runtime_seconds:min + expr: min by (site_name, source) (unpoller_device_ups_battery_time_remaining_seconds >= 0) + + # Total power output across UPSes per site (capacity planning) + - record: unpoller:ups_total_power_output_watts:sum + expr: sum by (site_name, source) (unpoller_device_ups_power_output_watts) + + # Total power budget across UPSes per site + - record: unpoller:ups_total_power_budget_watts:sum + expr: sum by (site_name, source) (unpoller_device_ups_power_budget_watts) + + # Count of UPS devices with BMS anomalies per site + - record: unpoller:ups_bms_anomaly_count:sum + expr: sum by (site_name, source) (unpoller_device_ups_bms_anomaly_count > 0) + + - name: unifi-devices-recording + interval: 1m + rules: + # Device count by type per site (capacity dashboards) + - record: unpoller:device_count:by_type + expr: count by (site_name, source, type) (unpoller_device_info) + + # Total device count per site + - record: unpoller:device_count:total + expr: count by (site_name, source) (unpoller_device_info) + + # Count of devices with high CPU per site + - record: unpoller:device_high_cpu_count:count + expr: count by (site_name, source) (unpoller_device_cpu_utilization_ratio > 0.9) + + # Count of devices with high memory per site + - record: unpoller:device_high_memory_count:count + expr: count by (site_name, 
source) (unpoller_device_memory_utilization_ratio > 0.9) + + - name: unifi-controller-recording + interval: 5m + rules: + # Count of controllers with updates available (multi-controller orgs) + - record: unpoller:controller_update_available:count + expr: count(unpoller_controller_update_available == 1) + + # Total unsupported devices across all controllers + - record: unpoller:controller_unsupported_devices_total:sum + expr: sum(unpoller_controller_unsupported_device_count) diff --git a/go.mod b/go.mod index 66359e26..c5ac80d5 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/prometheus/common v0.67.5 github.com/spf13/pflag v1.0.10 github.com/stretchr/testify v1.11.1 - github.com/unpoller/unifi/v5 v5.15.0 + github.com/unpoller/unifi/v5 v5.16.0 golang.org/x/crypto v0.47.0 golang.org/x/term v0.39.0 golift.io/cnfg v0.2.3 diff --git a/go.sum b/go.sum index 95754130..49f7194e 100644 --- a/go.sum +++ b/go.sum @@ -77,8 +77,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= -github.com/unpoller/unifi/v5 v5.15.0 h1:9xYBmboWBcY4Cv8ARbWMjBlAUNVlG7TIuX+aRf6mcUE= -github.com/unpoller/unifi/v5 v5.15.0/go.mod h1:vSIXIclPG9dpKxUp+pavfgENHWaTZXvDg7F036R1YCo= +github.com/unpoller/unifi/v5 v5.16.0 h1:FowfkJ7wbMoySFcqOJG2IJH9pOGTUnPpKNNG9vHl2/I= +github.com/unpoller/unifi/v5 v5.16.0/go.mod h1:vSIXIclPG9dpKxUp+pavfgENHWaTZXvDg7F036R1YCo= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= diff --git a/pkg/influxunifi/udm.go b/pkg/influxunifi/udm.go index c8d7da7d..3feaee3e 100644 --- 
a/pkg/influxunifi/udm.go +++ b/pkg/influxunifi/udm.go @@ -55,7 +55,7 @@ func (u *InfluxUnifi) batchUDMtemps(temps []unifi.Temperature) map[string]any { output := make(map[string]any) for _, t := range temps { - output["temp_"+sanitizeName(t.Name)] = t.Value + output["temp_"+sanitizeName(t.Name)] = int64(t.Value) } return output diff --git a/pkg/promunifi/pdu.go b/pkg/promunifi/pdu.go index 71268ec5..b9b42b63 100644 --- a/pkg/promunifi/pdu.go +++ b/pkg/promunifi/pdu.go @@ -58,6 +58,21 @@ type pdu struct { OutletPower *prometheus.Desc OutletPowerFactor *prometheus.Desc OutletVoltage *prometheus.Desc + // UPS battery health (vbms_table) + BatteryLevelPercent *prometheus.Desc + BatteryTimeRemaining *prometheus.Desc + BatteryCharging *prometheus.Desc + BatteryMode *prometheus.Desc + BatteriesAvailable *prometheus.Desc + BatteriesReady *prometheus.Desc + PowerBudgetWatts *prometheus.Desc + PowerOutputWatts *prometheus.Desc + PowerFactor *prometheus.Desc + OutputVoltage *prometheus.Desc + OutputCurrentAmps *prometheus.Desc + LoadPercent *prometheus.Desc + BMSAnomalyCount *prometheus.Desc + PowerCycleOnRecoveryEnabled *prometheus.Desc } func descPDU(ns string) *pdu { @@ -73,6 +88,7 @@ func descPDU(ns string) *pdu { labelO := []string{ "outlet_description", "outlet_index", "outlet_name", "site_name", "name", "source", "tag", } + labelUPS := []string{"site_name", "device_name", "device_mac", "source", "tag"} nd := prometheus.NewDesc return &pdu{ @@ -128,6 +144,21 @@ func descPDU(ns string) *pdu { OutletPower: nd(outlet+"outlet_power", "Outlet Power", labelO, nil), OutletPowerFactor: nd(outlet+"outlet_power_factor", "Outlet Power Factor", labelO, nil), OutletVoltage: nd(outlet+"outlet_voltage", "Outlet Voltage", labelO, nil), + // UPS battery health (vbms_table) + BatteryLevelPercent: nd(ns+"ups_battery_level_percent", "Battery charge level (0-100%)", labelUPS, nil), + BatteryTimeRemaining: nd(ns+"ups_battery_time_remaining_seconds", "Estimated runtime on battery", labelUPS, 
nil), + BatteryCharging: nd(ns+"ups_battery_charging", "Battery charging (1/0)", labelUPS, nil), + BatteryMode: nd(ns+"ups_battery_mode", "Running on battery (1/0)", labelUPS, nil), + BatteriesAvailable: nd(ns+"ups_batteries_available", "Number of batteries available", labelUPS, nil), + BatteriesReady: nd(ns+"ups_batteries_ready", "Number of batteries ready", labelUPS, nil), + PowerBudgetWatts: nd(ns+"ups_power_budget_watts", "Total power budget capacity", labelUPS, nil), + PowerOutputWatts: nd(ns+"ups_power_output_watts", "Current power output", labelUPS, nil), + PowerFactor: nd(ns+"ups_power_factor", "Power factor (0-1)", labelUPS, nil), + OutputVoltage: nd(ns+"ups_output_voltage", "Output voltage", labelUPS, nil), + OutputCurrentAmps: nd(ns+"ups_output_current_amps", "Output current in amps", labelUPS, nil), + LoadPercent: nd(ns+"ups_load_percent", "Load as percentage of capacity", labelUPS, nil), + BMSAnomalyCount: nd(ns+"ups_bms_anomaly_count", "Battery management anomalies", labelUPS, nil), + PowerCycleOnRecoveryEnabled: nd(ns+"ups_power_cycle_on_recovery_enabled", "Auto power cycle on AC recovery enabled (1/0)", labelUPS, nil), } } @@ -147,6 +178,10 @@ func (u *promUnifi) exportPDU(r report, d *unifi.PDU) { u.exportPDUstats(r, labels, d.Stat.Sw) u.exportPDUPrtTable(r, labels, d.PortTable) u.exportPDUOutletTable(r, labels, d.OutletTable, d.OutletOverrides) + + if d.VBMSTable != nil { + u.exportPDUVBMS(r, d, append([]string{d.SiteName, d.Name, d.Mac, d.SourceName}, tag)) + } u.exportBYTstats(r, labels, d.TxBytes, d.RxBytes) u.exportSYSstats(r, labels, d.SysStats, d.SystemStats) u.exportSTAcount(r, labels, d.UserNumSta, d.GuestNumSta) @@ -199,6 +234,42 @@ func (u *promUnifi) exportPDUstats(r report, labels []string, sw *unifi.Sw) { }) } +// exportPDUVBMS exports UPS battery health metrics from vbms_table (UPS devices only). 
+func (u *promUnifi) exportPDUVBMS(r report, d *unifi.PDU, labels []string) {
+	vbms := d.VBMSTable // exportPDU only calls this when d.VBMSTable is non-nil
+
+	r.send([]*metric{ // sent before the BattPool check, so emitted even without a battery pool
+		{u.PDU.BMSAnomalyCount, gauge, vbms.BMSRunAnomaly, labels},
+		{u.PDU.BatteryMode, gauge, vbms.IsBatteryMode.Val, labels},
+		{u.PDU.PowerCycleOnRecoveryEnabled, gauge, d.OutletPowerCycleOnACRecoveryEnabled.Val, labels}, // read from the device, not from vbms
+	})
+
+	if vbms.BattPool == nil { // no battery pool: skip every pool-derived metric below
+		return
+	}
+
+	bp := vbms.BattPool
+	loadPct := 0.0 // stays 0 when the power budget is not positive
+
+	if bp.DeviceTotalPowerBudget.Val > 0 { // guards the division below against a zero budget
+		loadPct = (bp.DeviceTotalPowerOutput.Val / bp.DeviceTotalPowerBudget.Val) * 100 // output as a percentage of budget
+	}
+
+	r.send([]*metric{
+		{u.PDU.BatteryLevelPercent, gauge, bp.BatteryLevel, labels},
+		{u.PDU.BatteryTimeRemaining, gauge, bp.TimeToRemain, labels},
+		{u.PDU.BatteryCharging, gauge, bp.IsCharging.Val, labels},
+		{u.PDU.BatteriesAvailable, gauge, bp.BattAvailableCnt, labels},
+		{u.PDU.BatteriesReady, gauge, bp.ReadyCnt, labels},
+		{u.PDU.PowerBudgetWatts, gauge, bp.DeviceTotalPowerBudget, labels},
+		{u.PDU.PowerOutputWatts, gauge, bp.DeviceTotalPowerOutput, labels},
+		{u.PDU.PowerFactor, gauge, bp.DeviceTotalPowerFactor, labels},
+		{u.PDU.OutputVoltage, gauge, bp.DeviceOutputVoltage, labels},
+		{u.PDU.OutputCurrentAmps, gauge, bp.DeviceOutputCurrent, labels},
+		{u.PDU.LoadPercent, gauge, loadPct, labels}, // computed above rather than read from the controller
+	})
+}
+
 // Switch Port Table. func (u *promUnifi) exportPDUPrtTable(r report, labels []string, pt []unifi.Port) { // Per-port data on a switch