diff --git a/PR_REMOTE_API_FABRIC_FIX.md b/PR_REMOTE_API_FABRIC_FIX.md new file mode 100644 index 00000000..536af4e7 --- /dev/null +++ b/PR_REMOTE_API_FABRIC_FIX.md @@ -0,0 +1,43 @@ +## Summary + +Fixes crashes and poor behavior when using **Remote API** (UniFi Site Manager / api.ui.com) with an API key—e.g. a “Fabric” or cloud API key—for authentication. We ran into 429 rate limiting, 403s from NVR/non-Network consoles, and **nil pointer panics** in `updateWeb` / `controllerID` during or after 429 handling. + +## Issue: Remote API with API key (e.g. Fabric key) + +With **remote API mode** enabled and an API key (e.g. from unifi.ui.com / Fabric): + +1. **429 Too Many Requests** – Discovery and polling hit the cloud API quickly. The API returns 429 with `Retry-After` (e.g. 1m0s). We either failed immediately or retried once with a short cap, so some controllers were never discovered or we kept hitting 429. +2. **403 on non-Network consoles** – The `/v1/hosts` list includes NVRs, CloudKey+ for Displays, etc. Calling the Network “sites” API on those returns 403. We were calling every console, causing 403 noise and wasting rate limit. +3. **Nil pointer panic in `updateWeb`** – When the API returned 429 during polling, re-auth or internal handling could set `c.Unifi = nil`. We used `defer updateWeb(c, m)`, so when the deferred call ran, `c` or `c.Unifi` could be nil and `controllerID(c)` panicked (`addr=0x28` = dereferencing `c.Unifi` when `c` was nil or `c.Unifi` was nil). The process crashed repeatedly under 429. + +## What was required to fix it + +### In **unpoller/unifi** (library, separate PR) + +- **429**: Retry up to 3 times using the **full** `Retry-After` from the API (no 15s cap) so rate-limited requests can succeed. +- **NVR/Protect filtering**: Added `FilterNetworkConsoles` and `DiscoverNetworkConsoles()` so we only discover consoles that support the Network API; skip names containing `nvr`, `protect`, `cloudkey+ for displays` to avoid 403s and save rate limit. +- **Error response safety**: Cap read size for 4xx/5xx responses and truncate body in error messages to avoid OOM from huge HTML/error bodies. + +### In **unpoller** (this PR) + +- **Discovery**: Use `DiscoverNetworkConsoles()` and retry each console up to 3 times with a short delay. After 3 failures (429 or 403), log once: `Excluding controller : api key does not have permissions (after 3 attempts)` and stop trying that controller. +- **updateWeb panic**: + - Removed `defer updateWeb(c, m)`. Call `updateWeb(c, m)` only on the **success path** at the end of `pollController`, guarded by `c != nil && c.Unifi != nil`. + - In `updateWeb`: guard on `c == nil`, `metrics == nil`, `c.Unifi == nil`; add `defer recover()` to log and swallow any remaining panic so one bad path doesn’t kill the process. + - In `controllerID`: safe nil checks and single dereference. + - In `formatSites` / `formatClients` / `formatDevices`: skip nil slice elements. + - In `pollController`: wrap the `updateWeb` call in a `recover()` so even an old image or race doesn’t crash the poller. +- **Dockerfile.local**: Add a Dockerfile that builds unpoller with a **local** unifi library (build context = parent dir with `unifi/` and `unpoller/`). Uses `go mod edit -replace` and `go build` inside the image so the container is built entirely from local repos for testing. + +## Testing + +- Ran unpoller in Kubernetes with remote API and an API key; multiple Network and NVR consoles in the account. +- Confirmed: NVRs excluded from discovery, 429 retries with full backoff, no more `controllerID`/`updateWeb` panics when 429 occurs during polling. Excluded controllers get a single log line after 3 attempts. + +## Dependencies + +- Requires **unpoller/unifi** changes (429 retry, `DiscoverNetworkConsoles`, error body handling). Until that library is released, build unpoller with a local unifi clone and `Dockerfile.local` (or `go mod replace`). + +## CI / Workflows + +**Workflows are expected to fail on this PR until [unpoller/unifi#200](https://github.com/unpoller/unifi/pull/200) is merged and a new unifi release is available.** This PR calls `DiscoverNetworkConsoles()` and uses 429/error behavior that exist only in the unifi library PR. The current `go.mod` points at the published unifi module, which does not yet provide `DiscoverNetworkConsoles`, so `go build` and `go test` in CI will fail with an undefined method error. Once unifi is updated and unpoller’s `go.mod` is updated to that version (or the unifi PR is merged and a release cut), CI should pass. diff --git a/go.mod b/go.mod index ed3e2de3..067711a9 100644 --- a/go.mod +++ b/go.mod @@ -12,9 +12,9 @@ require ( github.com/prometheus/common v0.67.5 github.com/spf13/pflag v1.0.10 github.com/stretchr/testify v1.11.1 - github.com/unpoller/unifi/v5 v5.18.0 - golang.org/x/crypto v0.48.0 - golang.org/x/term v0.40.0 + github.com/unpoller/unifi/v5 v5.20.0 + golang.org/x/crypto v0.47.0 + golang.org/x/term v0.39.0 golift.io/cnfg v0.2.3 golift.io/cnfgfile v0.0.0-20240713024420-a5436d84eb48 golift.io/version v0.0.2 diff --git a/go.sum b/go.sum index 47ba32be..89a844e6 100644 --- a/go.sum +++ b/go.sum @@ -77,8 +77,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= -github.com/unpoller/unifi/v5 v5.18.0 h1:i9xecLeI9CU6m+5++TIm+zhdGS9f8KCUz8PuuzO7sSQ= -github.com/unpoller/unifi/v5 v5.18.0/go.mod h1:vSIXIclPG9dpKxUp+pavfgENHWaTZXvDg7F036R1YCo= +github.com/unpoller/unifi/v5 v5.20.0 h1:lbjjHrFZgINGjk+A+efyYsdo21qdQg+WLAXZJpBvqMI= +github.com/unpoller/unifi/v5 v5.20.0/go.mod h1:vSIXIclPG9dpKxUp+pavfgENHWaTZXvDg7F036R1YCo= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= @@ -86,8 +86,8 @@ go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= -golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= +golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8= +golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -106,8 +106,8 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= -golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= +golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= +golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/pkg/influxunifi/udm.go b/pkg/influxunifi/udm.go index 37433456..bcd2302f 100644 --- a/pkg/influxunifi/udm.go +++ b/pkg/influxunifi/udm.go @@ -55,7 +55,7 @@ func (u *InfluxUnifi) batchUDMtemps(temps []unifi.Temperature) map[string]any { output := make(map[string]any) for _, t := range temps { - output["temp_"+sanitizeName(t.Name)] = int64(t.Value) + output["temp_"+sanitizeName(t.Name)] = t.Value } return output diff --git a/pkg/inputunifi/collector.go b/pkg/inputunifi/collector.go index 139df4f3..f82441e8 100644 --- a/pkg/inputunifi/collector.go +++ b/pkg/inputunifi/collector.go @@ -100,6 +100,9 @@ func (u *InputUnifi) pollController(c *Controller) (*poller.Metrics, error) { u.RLock() defer u.RUnlock() + if c == nil { + return nil, fmt.Errorf("controller is nil") + } if c.Unifi == nil { return nil, fmt.Errorf("controller client is nil (e.g. after 429 or auth failure): %s", c.URL) } @@ -113,7 +116,6 @@ func (u *InputUnifi) pollController(c *Controller) (*poller.Metrics, error) { } m := &Metrics{TS: time.Now(), Sites: sites} - defer updateWeb(c, m) // FIXME needs to be last poll time maybe st := m.TS.Add(-1 * pollDuration) @@ -210,6 +212,18 @@ func (u *InputUnifi) pollController(c *Controller) (*poller.Metrics, error) { u.LogDebugf("Found %d Sysinfo entries", len(m.Sysinfos)) } + // Update web UI only on success; call explicitly so we never run with nil c/c.Unifi (no defer). + // Recover so a panic in updateWeb (e.g. old image, race) never kills the poller. + if c != nil && c.Unifi != nil { + func() { + defer func() { + if r := recover(); r != nil { + u.LogErrorf("updateWeb panic recovered (upgrade image if this persists): %v", r) + } + }() + updateWeb(c, m) + }() + } return u.augmentMetrics(c, m), nil } diff --git a/pkg/inputunifi/remote.go b/pkg/inputunifi/remote.go index a8886cb8..4766b4a4 100644 --- a/pkg/inputunifi/remote.go +++ b/pkg/inputunifi/remote.go @@ -3,10 +3,14 @@ package inputunifi import ( "fmt" "strings" + "time" "github.com/unpoller/unifi/v5" ) +const discoverMaxAttempts = 3 +const discoverRetryDelay = 5 * time.Second + // discoverRemoteControllers discovers all controllers via remote API and creates Controller entries. func (u *InputUnifi) discoverRemoteControllers(apiKey string) ([]*Controller, error) { // Handle file:// prefix for API key @@ -23,17 +27,17 @@ func (u *InputUnifi) discoverRemoteControllers(apiKey string) ([]*Controller, er u.Logf("Discovering remote UniFi consoles...") - consoles, err := client.DiscoverConsoles() + consoles, err := client.DiscoverNetworkConsoles() if err != nil { return nil, fmt.Errorf("discovering consoles: %w", err) } if len(consoles) == 0 { - u.Logf("No consoles found via remote API") + u.Logf("No Network-capable consoles found via remote API (NVR/Protect/display-only consoles are excluded)") return nil, nil } - u.Logf("Found %d console(s) via remote API", len(consoles)) + u.Logf("Found %d Network-capable console(s) via remote API", len(consoles)) var controllers []*Controller @@ -45,11 +49,25 @@ func (u *InputUnifi) discoverRemoteControllers(apiKey string) ([]*Controller, er if consoleName == "" { consoleName = console.ReportedState.Hostname } + if consoleName == "" { + consoleName = console.ID + } u.LogDebugf("Discovering sites for console: %s (%s)", console.ID, consoleName) - sites, err := client.DiscoverSites(console.ID) - if err != nil { - u.LogErrorf("Failed to discover sites for console %s: %v", console.ID, err) + var sites []unifi.RemoteSite + var lastErr error + for attempt := 1; attempt <= discoverMaxAttempts; attempt++ { + sites, lastErr = client.DiscoverSites(console.ID) + if lastErr == nil { + break + } + if attempt < discoverMaxAttempts { + u.LogDebugf("Discover sites attempt %d/%d failed for %s: %v; retrying in %v", attempt, discoverMaxAttempts, consoleName, lastErr, discoverRetryDelay) + time.Sleep(discoverRetryDelay) + } + } + if lastErr != nil { + u.Logf("Excluding controller %s: api key does not have permissions (after %d attempts)", consoleName, discoverMaxAttempts) continue } diff --git a/pkg/inputunifi/updateweb.go b/pkg/inputunifi/updateweb.go index 60145037..b36f5fc4 100644 --- a/pkg/inputunifi/updateweb.go +++ b/pkg/inputunifi/updateweb.go @@ -2,6 +2,7 @@ package inputunifi import ( "fmt" + "log" "strconv" "time" @@ -14,14 +15,29 @@ import ( // controllerID returns the controller UUID for display, or "" if the client is nil (e.g. after 429 re-auth failure). // Avoids SIGSEGV when updateWeb runs while c.Unifi is nil (see unpoller/unpoller#943). func controllerID(c *Controller) string { - if c == nil || c.Unifi == nil { + if c == nil { return "" } - - return c.Unifi.UUID + client := c.Unifi + if client == nil { + return "" + } + return client.UUID } func updateWeb(c *Controller, metrics *Metrics) { + defer func() { + if r := recover(); r != nil { + // Log and swallow panic so one bad controller doesn't kill the process (e.g. nil c/c.Unifi race). + log.Printf("[ERROR] updateWeb panic recovered: %v", r) + } + }() + if c == nil || metrics == nil { + return + } + if c.Unifi == nil { + return + } webserver.UpdateInput(&webserver.Input{ Name: PluginName, // Forgetting this leads to 3 hours of head scratching. Sites: formatSites(c, metrics.Sites), @@ -78,6 +94,9 @@ func formatSites(c *Controller, sites []*unifi.Site) (s webserver.Sites) { id := controllerID(c) for _, site := range sites { + if site == nil { + continue + } s = append(s, &webserver.Site{ ID: site.ID, Name: site.Name, @@ -92,6 +111,9 @@ func formatSites(c *Controller, sites []*unifi.Site) (s webserver.Sites) { func formatClients(c *Controller, clients []*unifi.Client) (d webserver.Clients) { for _, client := range clients { + if client == nil { + continue + } clientType, deviceMAC := "unknown", "unknown" if client.ApMac != "" { clientType = "wireless" @@ -132,6 +154,9 @@ func formatDevices(c *Controller, devices *unifi.Devices) (d webserver.Devices) id := controllerID(c) for _, device := range devices.UAPs { + if device == nil { + continue + } d = append(d, &webserver.Device{ Name: device.Name, SiteID: device.SiteID, @@ -149,6 +174,9 @@ func formatDevices(c *Controller, devices *unifi.Devices) (d webserver.Devices) } for _, device := range devices.UDMs { + if device == nil { + continue + } d = append(d, &webserver.Device{ Name: device.Name, SiteID: device.SiteID, @@ -166,6 +194,9 @@ func formatDevices(c *Controller, devices *unifi.Devices) (d webserver.Devices) } for _, device := range devices.USWs { + if device == nil { + continue + } d = append(d, &webserver.Device{ Name: device.Name, SiteID: device.SiteID, @@ -183,6 +214,9 @@ func formatDevices(c *Controller, devices *unifi.Devices) (d webserver.Devices) } for _, device := range devices.USGs { + if device == nil { + continue + } d = append(d, &webserver.Device{ Name: device.Name, SiteID: device.SiteID,