[client] Fix exit node menu not refreshing on Windows (#5553)

* [client] Fix exit node menu not refreshing on Windows TrayOpenedCh is not implemented in the systray library on Windows, so exit nodes were never refreshed after the initial connect. Combined with the management sync not having populated routes yet when the Connected status fires, this caused the exit node menu to remain empty permanently after disconnect/reconnect cycles. Add a background poller on Windows that refreshes exit nodes while connected, with fast initial polling to catch routes from management sync followed by a steady 10s interval. On macOS/Linux, TrayOpenedCh continues to handle refreshes on each tray open. Also fix a data race on connectClient assignment in the server's connect() method and add nil checks in CleanState/DeleteState to prevent panics when connectClient is nil. * Remove unused exitNodeIDs * Remove unused exitNodeState struct
2026-03-31 06:24:18 -04:00 · 2026-03-09 18:39:11 +01:00
parent 11eb725ac8
commit 15aa6bae1b
6 changed files with 267 additions and 38 deletions
--- a/client/server/server.go
+++ b/client/server/server.go
@@ -1625,9 +1625,14 @@ func (s *Server) GetFeatures(ctx context.Context, msg *proto.GetFeaturesRequest)

 func (s *Server) connect(ctx context.Context, config *profilemanager.Config, statusRecorder *peer.Status, doInitialAutoUpdate bool, runningChan chan struct{}) error {
 	log.Tracef("running client connection")
-	s.connectClient = internal.NewConnectClient(ctx, config, statusRecorder, doInitialAutoUpdate)
-	s.connectClient.SetSyncResponsePersistence(s.persistSyncResponse)
-	if err := s.connectClient.Run(runningChan, s.logFile); err != nil {
+	client := internal.NewConnectClient(ctx, config, statusRecorder, doInitialAutoUpdate)
+	client.SetSyncResponsePersistence(s.persistSyncResponse)
+
+	s.mutex.Lock()
+	s.connectClient = client
+	s.mutex.Unlock()
+
+	if err := client.Run(runningChan, s.logFile); err != nil {
 		return err
 	}
 	return nil
--- a/client/server/server_connect_test.go
+++ b/client/server/server_connect_test.go
@@ -0,0 +1,187 @@
+package server
+
+import (
+	"context"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/netbirdio/netbird/client/internal"
+	"github.com/netbirdio/netbird/client/internal/peer"
+	"github.com/netbirdio/netbird/client/proto"
+)
+
+func newTestServer() *Server {
+	return &Server{
+		rootCtx:        context.Background(),
+		statusRecorder: peer.NewRecorder(""),
+	}
+}
+
+func newDummyConnectClient(ctx context.Context) *internal.ConnectClient {
+	return internal.NewConnectClient(ctx, nil, nil, false)
+}
+
+// TestConnectSetsClientWithMutex validates that connect() sets s.connectClient
+// under mutex protection so concurrent readers see a consistent value.
+func TestConnectSetsClientWithMutex(t *testing.T) {
+	s := newTestServer()
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Manually simulate what connect() does (without calling Run which panics without full setup)
+	client := newDummyConnectClient(ctx)
+
+	s.mutex.Lock()
+	s.connectClient = client
+	s.mutex.Unlock()
+
+	// Verify the assignment is visible under mutex
+	s.mutex.Lock()
+	assert.Equal(t, client, s.connectClient, "connectClient should be set")
+	s.mutex.Unlock()
+}
+
+// TestConcurrentConnectClientAccess validates that concurrent reads of
+// s.connectClient under mutex don't race with a write.
+func TestConcurrentConnectClientAccess(t *testing.T) {
+	s := newTestServer()
+	ctx := context.Background()
+	client := newDummyConnectClient(ctx)
+
+	var wg sync.WaitGroup
+	nilCount := 0
+	setCount := 0
+	var mu sync.Mutex
+
+	// Start readers
+	for i := 0; i < 50; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			s.mutex.Lock()
+			c := s.connectClient
+			s.mutex.Unlock()
+
+			mu.Lock()
+			defer mu.Unlock()
+			if c == nil {
+				nilCount++
+			} else {
+				setCount++
+			}
+		}()
+	}
+
+	// Simulate connect() writing under mutex
+	time.Sleep(5 * time.Millisecond)
+	s.mutex.Lock()
+	s.connectClient = client
+	s.mutex.Unlock()
+
+	wg.Wait()
+
+	assert.Equal(t, 50, nilCount+setCount, "all goroutines should complete without panic")
+}
+
+// TestCleanupConnection_ClearsConnectClient validates that cleanupConnection
+// properly nils out connectClient.
+func TestCleanupConnection_ClearsConnectClient(t *testing.T) {
+	s := newTestServer()
+	_, cancel := context.WithCancel(context.Background())
+	s.actCancel = cancel
+
+	s.connectClient = newDummyConnectClient(context.Background())
+	s.clientRunning = true
+
+	err := s.cleanupConnection()
+	require.NoError(t, err)
+
+	assert.Nil(t, s.connectClient, "connectClient should be nil after cleanup")
+}
+
+// TestCleanState_NilConnectClient validates that CleanState doesn't panic
+// when connectClient is nil.
+func TestCleanState_NilConnectClient(t *testing.T) {
+	s := newTestServer()
+	s.connectClient = nil
+	s.profileManager = nil // will cause error if it tries to proceed past the nil check
+
+	// Should not panic — the nil check should prevent calling Status() on nil
+	assert.NotPanics(t, func() {
+		_, _ = s.CleanState(context.Background(), &proto.CleanStateRequest{All: true})
+	})
+}
+
+// TestDeleteState_NilConnectClient validates that DeleteState doesn't panic
+// when connectClient is nil.
+func TestDeleteState_NilConnectClient(t *testing.T) {
+	s := newTestServer()
+	s.connectClient = nil
+	s.profileManager = nil
+
+	assert.NotPanics(t, func() {
+		_, _ = s.DeleteState(context.Background(), &proto.DeleteStateRequest{All: true})
+	})
+}
+
+// TestDownThenUp_StaleRunningChan documents the known state issue where
+// clientRunningChan from a previous connection is already closed, causing
+// waitForUp() to return immediately on reconnect.
+func TestDownThenUp_StaleRunningChan(t *testing.T) {
+	s := newTestServer()
+
+	// Simulate state after a successful connection
+	s.clientRunning = true
+	s.clientRunningChan = make(chan struct{})
+	close(s.clientRunningChan) // closed when engine started
+	s.clientGiveUpChan = make(chan struct{})
+	s.connectClient = newDummyConnectClient(context.Background())
+
+	_, cancel := context.WithCancel(context.Background())
+	s.actCancel = cancel
+
+	// Simulate Down(): cleanupConnection sets connectClient = nil
+	s.mutex.Lock()
+	err := s.cleanupConnection()
+	s.mutex.Unlock()
+	require.NoError(t, err)
+
+	// After cleanup: connectClient is nil, clientRunning still true
+	// (goroutine hasn't exited yet)
+	s.mutex.Lock()
+	assert.Nil(t, s.connectClient, "connectClient should be nil after cleanup")
+	assert.True(t, s.clientRunning, "clientRunning still true until goroutine exits")
+	s.mutex.Unlock()
+
+	// waitForUp() returns immediately due to stale closed clientRunningChan
+	ctx, ctxCancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer ctxCancel()
+
+	waitDone := make(chan error, 1)
+	go func() {
+		_, err := s.waitForUp(ctx)
+		waitDone <- err
+	}()
+
+	select {
+	case err := <-waitDone:
+		assert.NoError(t, err, "waitForUp returns success on stale channel")
+		// But connectClient is still nil — this is the stale state issue
+		s.mutex.Lock()
+		assert.Nil(t, s.connectClient, "connectClient is nil despite waitForUp success")
+		s.mutex.Unlock()
+	case <-time.After(1 * time.Second):
+		t.Fatal("waitForUp should have returned immediately due to stale closed channel")
+	}
+}
+
+// TestConnectClient_EngineNilOnFreshClient validates that a newly created
+// ConnectClient has nil Engine (before Run is called).
+func TestConnectClient_EngineNilOnFreshClient(t *testing.T) {
+	client := newDummyConnectClient(context.Background())
+	assert.Nil(t, client.Engine(), "engine should be nil on fresh ConnectClient")
+}
--- a/client/server/state.go
+++ b/client/server/state.go
@@ -39,7 +39,7 @@ func (s *Server) ListStates(_ context.Context, _ *proto.ListStatesRequest) (*pro

 // CleanState handles cleaning of states (performing cleanup operations)
 func (s *Server) CleanState(ctx context.Context, req *proto.CleanStateRequest) (*proto.CleanStateResponse, error) {
-	if s.connectClient.Status() == internal.StatusConnected || s.connectClient.Status() == internal.StatusConnecting {
+	if s.connectClient != nil && (s.connectClient.Status() == internal.StatusConnected || s.connectClient.Status() == internal.StatusConnecting) {
 		return nil, status.Errorf(codes.FailedPrecondition, "cannot clean state while connecting or connected, run 'netbird down' first.")
 	}

@@ -82,7 +82,7 @@ func (s *Server) CleanState(ctx context.Context, req *proto.CleanStateRequest) (

 // DeleteState handles deletion of states without cleanup
 func (s *Server) DeleteState(ctx context.Context, req *proto.DeleteStateRequest) (*proto.DeleteStateResponse, error) {
-	if s.connectClient.Status() == internal.StatusConnected || s.connectClient.Status() == internal.StatusConnecting {
+	if s.connectClient != nil && (s.connectClient.Status() == internal.StatusConnected || s.connectClient.Status() == internal.StatusConnecting) {
 		return nil, status.Errorf(codes.FailedPrecondition, "cannot clean state while connecting or connected, run 'netbird down' first.")
 	}

--- a/client/ui/client_ui.go
+++ b/client/ui/client_ui.go
@@ -323,7 +323,7 @@ type serviceClient struct {

 	exitNodeMu           sync.Mutex
 	mExitNodeItems       []menuHandler
-	exitNodeStates       []exitNodeState
+	exitNodeRetryCancel  context.CancelFunc
 	mExitNodeDeselectAll *systray.MenuItem
 	logFile              string
 	wLoginURL            fyne.Window
@@ -924,7 +924,7 @@ func (s *serviceClient) updateStatus() error {
 			s.mDown.Enable()
 			s.mNetworks.Enable()
 			s.mExitNode.Enable()
-			go s.updateExitNodes()
+			s.startExitNodeRefresh()
 			systrayIconState = true
 		case status.Status == string(internal.StatusConnecting):
 			s.setConnectingStatus()
@@ -985,6 +985,7 @@ func (s *serviceClient) setDisconnectedStatus() {
 	s.mUp.Enable()
 	s.mNetworks.Disable()
 	s.mExitNode.Disable()
+	s.cancelExitNodeRetry()
 	go s.updateExitNodes()
 }

--- a/client/ui/event_handler.go
+++ b/client/ui/event_handler.go
@@ -100,8 +100,7 @@ func (h *eventHandler) handleConnectClick() {

 func (h *eventHandler) handleDisconnectClick() {
 	h.client.mDown.Disable()
-
-	h.client.exitNodeStates = []exitNodeState{}
+	h.client.cancelExitNodeRetry()

 	if h.client.connectCancel != nil {
 		log.Debugf("cancelling ongoing connect operation")
--- a/client/ui/network.go
+++ b/client/ui/network.go
@@ -6,7 +6,6 @@ import (
 	"context"
 	"fmt"
 	"runtime"
-	"slices"
 	"sort"
 	"strings"
 	"time"
@@ -34,11 +33,6 @@ const (

 type filter string

-type exitNodeState struct {
-	id       string
-	selected bool
-}
-
 func (s *serviceClient) showNetworksUI() {
 	s.wNetworks = s.app.NewWindow("Networks")
 	s.wNetworks.SetOnClosed(s.cancel)
@@ -335,16 +329,75 @@ func (s *serviceClient) updateNetworksBasedOnDisplayTab(tabs *container.AppTabs,
 	s.updateNetworks(grid, f)
 }

-func (s *serviceClient) updateExitNodes() {
+// startExitNodeRefresh initiates exit node menu refresh after connecting.
+// On Windows, TrayOpenedCh is not supported by the systray library, so we use
+// a background poller to keep exit nodes in sync while connected.
+// On macOS/Linux, TrayOpenedCh handles refreshes on each tray open.
+func (s *serviceClient) startExitNodeRefresh() {
+	s.cancelExitNodeRetry()
+
+	if runtime.GOOS == "windows" {
+		ctx, cancel := context.WithCancel(s.ctx)
+		s.exitNodeMu.Lock()
+		s.exitNodeRetryCancel = cancel
+		s.exitNodeMu.Unlock()
+
+		go s.pollExitNodes(ctx)
+	} else {
+		go s.updateExitNodes()
+	}
+}
+
+func (s *serviceClient) cancelExitNodeRetry() {
+	s.exitNodeMu.Lock()
+	if s.exitNodeRetryCancel != nil {
+		s.exitNodeRetryCancel()
+		s.exitNodeRetryCancel = nil
+	}
+	s.exitNodeMu.Unlock()
+}
+
+// pollExitNodes periodically refreshes exit nodes while connected.
+// Uses a short initial interval to catch routes from the management sync,
+// then switches to a longer interval for ongoing updates.
+func (s *serviceClient) pollExitNodes(ctx context.Context) {
+	// Initial fast polling to catch routes as they appear after connect.
+	for i := 0; i < 5; i++ {
+		if s.updateExitNodes() {
+			break
+		}
+		select {
+		case <-ctx.Done():
+			return
+		case <-time.After(2 * time.Second):
+		}
+	}
+
+	ticker := time.NewTicker(10 * time.Second)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			s.updateExitNodes()
+		}
+	}
+}
+
+// updateExitNodes fetches exit nodes from the daemon and recreates the menu.
+// Returns true if exit nodes were found.
+func (s *serviceClient) updateExitNodes() bool {
 	conn, err := s.getSrvClient(defaultFailTimeout)
 	if err != nil {
 		log.Errorf("get client: %v", err)
-		return
+		return false
 	}
 	exitNodes, err := s.getExitNodes(conn)
 	if err != nil {
 		log.Errorf("get exit nodes: %v", err)
-		return
+		return false
 	}

 	s.exitNodeMu.Lock()
@@ -354,28 +407,14 @@ func (s *serviceClient) updateExitNodes() {

 	if len(s.mExitNodeItems) > 0 {
 		s.mExitNode.Enable()
-	} else {
-		s.mExitNode.Disable()
+		return true
 	}
+
+	s.mExitNode.Disable()
+	return false
 }

 func (s *serviceClient) recreateExitNodeMenu(exitNodes []*proto.Network) {
-	var exitNodeIDs []exitNodeState
-	for _, node := range exitNodes {
-		exitNodeIDs = append(exitNodeIDs, exitNodeState{
-			id:       node.ID,
-			selected: node.Selected,
-		})
-	}
-
-	sort.Slice(exitNodeIDs, func(i, j int) bool {
-		return exitNodeIDs[i].id < exitNodeIDs[j].id
-	})
-	if slices.Equal(s.exitNodeStates, exitNodeIDs) {
-		log.Debug("Exit node menu already up to date")
-		return
-	}
-
 	for _, node := range s.mExitNodeItems {
 		node.cancel()
 		node.Hide()
@@ -413,8 +452,6 @@ func (s *serviceClient) recreateExitNodeMenu(exitNodes []*proto.Network) {
 		go s.handleChecked(ctx, node.ID, menuItem)
 	}

-	s.exitNodeStates = exitNodeIDs
-
 	if showDeselectAll {
 		s.mExitNode.AddSeparator()
 		deselectAllItem := s.mExitNode.AddSubMenuItem("Deselect All", "Deselect All")