Compare commits

...

5 Commits

Author SHA1 Message Date
Zoltán Papp
8f6e916517 [client] Improve error reporting for route socket operations 2026-04-09 17:30:58 +02:00
Zoltán Papp
7a1c2f08b8 [client] Handle EEXIST when adding routes on macOS/BSD
On macOS, routes from a previous session can survive across sleep/wake
cycles even though cleanup reports success. When the new engine instance
tries to add the same route, it fails with "file exists" (EEXIST),
leaving the route untracked by the route manager while the OS still
has it pointing at a stale interface.

Handle this by detecting EEXIST on RTM_ADD, removing the stale route,
and retrying. This matches how Linux silently handles existing routes
via netlink.
2026-04-09 17:19:29 +02:00
Maycon Santos
099c493b18 [management] network map tests (#5795)
* Add network map benchmark and correctness test files

* Add tests for network map components correctness and edge cases

* Skip benchmarks in CI and enhance network map test coverage with new helper functions

* Remove legacy network map benchmarks and tests; refactor components-based test coverage for clarity and scalability.
2026-04-08 21:28:29 +02:00
Pascal Fischer
c1d1229ae0 [management] use NullBool for terminated flag (#5829) 2026-04-08 21:08:43 +02:00
Viktor Liu
94a36cb53e [client] Handle UPnP routers that only support permanent leases (#5826) 2026-04-08 17:59:59 +02:00
10 changed files with 1545 additions and 103 deletions

View File

@@ -6,6 +6,7 @@ import (
"context"
"fmt"
"net"
"regexp"
"sync"
"time"
@@ -15,19 +16,29 @@ import (
const (
defaultMappingTTL = 2 * time.Hour
renewalInterval = defaultMappingTTL / 2
discoveryTimeout = 10 * time.Second
mappingDescription = "NetBird"
)
// upnpErrPermanentLeaseOnly matches UPnP error 725 in SOAP fault XML,
// allowing for whitespace/newlines between tags from different router firmware.
var upnpErrPermanentLeaseOnly = regexp.MustCompile(`<errorCode>\s*725\s*</errorCode>`)
// Mapping represents an active NAT port mapping.
type Mapping struct {
Protocol string
InternalPort uint16
ExternalPort uint16
ExternalIP net.IP
NATType string
// TTL is the lease duration. Zero means a permanent lease that never expires.
TTL time.Duration
}
// TODO: persist mapping state for crash recovery cleanup of permanent leases.
// Currently not done because State.Cleanup requires NAT gateway re-discovery,
// which blocks startup for ~10s when no gateway is present (affects all clients).
type Manager struct {
cancel context.CancelFunc
@@ -43,6 +54,7 @@ type Manager struct {
mu sync.Mutex
}
// NewManager creates a new port forwarding manager.
func NewManager() *Manager {
return &Manager{
stopCtx: make(chan context.Context, 1),
@@ -77,8 +89,7 @@ func (m *Manager) Start(ctx context.Context, wgPort uint16) {
gateway, mapping, err := m.setup(ctx)
if err != nil {
log.Errorf("failed to setup NAT port mapping: %v", err)
log.Infof("port forwarding setup: %v", err)
return
}
@@ -86,7 +97,7 @@ func (m *Manager) Start(ctx context.Context, wgPort uint16) {
m.mapping = mapping
m.mappingLock.Unlock()
m.renewLoop(ctx, gateway)
m.renewLoop(ctx, gateway, mapping.TTL)
select {
case cleanupCtx := <-m.stopCtx:
@@ -145,16 +156,14 @@ func (m *Manager) setup(ctx context.Context) (nat.NAT, *Mapping, error) {
gateway, err := nat.DiscoverGateway(discoverCtx)
if err != nil {
log.Infof("NAT gateway discovery failed: %v (port forwarding disabled)", err)
return nil, nil, err
return nil, nil, fmt.Errorf("discover gateway: %w", err)
}
log.Infof("discovered NAT gateway: %s", gateway.Type())
mapping, err := m.createMapping(ctx, gateway)
if err != nil {
log.Warnf("failed to create port mapping: %v", err)
return nil, nil, err
return nil, nil, fmt.Errorf("create port mapping: %w", err)
}
return gateway, mapping, nil
}
@@ -163,9 +172,18 @@ func (m *Manager) createMapping(ctx context.Context, gateway nat.NAT) (*Mapping,
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
externalPort, err := gateway.AddPortMapping(ctx, "udp", int(m.wgPort), mappingDescription, defaultMappingTTL)
ttl := defaultMappingTTL
externalPort, err := gateway.AddPortMapping(ctx, "udp", int(m.wgPort), mappingDescription, ttl)
if err != nil {
return nil, err
if !isPermanentLeaseRequired(err) {
return nil, err
}
log.Infof("gateway only supports permanent leases, retrying with indefinite duration")
ttl = 0
externalPort, err = gateway.AddPortMapping(ctx, "udp", int(m.wgPort), mappingDescription, ttl)
if err != nil {
return nil, err
}
}
externalIP, err := gateway.GetExternalAddress()
@@ -180,6 +198,7 @@ func (m *Manager) createMapping(ctx context.Context, gateway nat.NAT) (*Mapping,
ExternalPort: uint16(externalPort),
ExternalIP: externalIP,
NATType: gateway.Type(),
TTL: ttl,
}
log.Infof("created port mapping: %d -> %d via %s (external IP: %s)",
@@ -187,8 +206,14 @@ func (m *Manager) createMapping(ctx context.Context, gateway nat.NAT) (*Mapping,
return mapping, nil
}
func (m *Manager) renewLoop(ctx context.Context, gateway nat.NAT) {
ticker := time.NewTicker(renewalInterval)
func (m *Manager) renewLoop(ctx context.Context, gateway nat.NAT, ttl time.Duration) {
if ttl == 0 {
// Permanent mappings don't expire, just wait for cancellation.
<-ctx.Done()
return
}
ticker := time.NewTicker(ttl / 2)
defer ticker.Stop()
for {
@@ -208,7 +233,7 @@ func (m *Manager) renewMapping(ctx context.Context, gateway nat.NAT) error {
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
externalPort, err := gateway.AddPortMapping(ctx, m.mapping.Protocol, int(m.mapping.InternalPort), mappingDescription, defaultMappingTTL)
externalPort, err := gateway.AddPortMapping(ctx, m.mapping.Protocol, int(m.mapping.InternalPort), mappingDescription, m.mapping.TTL)
if err != nil {
return fmt.Errorf("add port mapping: %w", err)
}
@@ -248,3 +273,8 @@ func (m *Manager) startTearDown(ctx context.Context) {
default:
}
}
// isPermanentLeaseRequired checks if a UPnP error indicates the gateway only supports permanent leases (error 725).
func isPermanentLeaseRequired(err error) bool {
return err != nil && upnpErrPermanentLeaseOnly.MatchString(err.Error())
}

View File

@@ -3,15 +3,18 @@ package portforward
import (
"context"
"net"
"time"
)
// Mapping represents port mapping information.
// Mapping represents an active NAT port mapping.
type Mapping struct {
Protocol string
InternalPort uint16
ExternalPort uint16
ExternalIP net.IP
NATType string
// TTL is the lease duration. Zero means a permanent lease that never expires.
TTL time.Duration
}
// Manager is a stub for js/wasm builds where NAT-PMP/UPnP is not supported.

View File

@@ -4,23 +4,25 @@ package portforward
import (
"context"
"fmt"
"net"
"testing"
"time"
"github.com/libp2p/go-nat"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type mockNAT struct {
natType string
deviceAddr net.IP
externalAddr net.IP
internalAddr net.IP
mappings map[int]int
addMappingErr error
deleteMappingErr error
natType string
deviceAddr net.IP
externalAddr net.IP
internalAddr net.IP
mappings map[int]int
addMappingErr error
deleteMappingErr error
onlyPermanentLeases bool
lastTimeout time.Duration
}
func newMockNAT() *mockNAT {
@@ -53,8 +55,12 @@ func (m *mockNAT) AddPortMapping(ctx context.Context, protocol string, internalP
if m.addMappingErr != nil {
return 0, m.addMappingErr
}
if m.onlyPermanentLeases && timeout != 0 {
return 0, fmt.Errorf("SOAP fault. Code: | Explanation: | Detail: <UPnPError xmlns=\"urn:schemas-upnp-org:control-1-0\"><errorCode>725</errorCode><errorDescription>OnlyPermanentLeasesSupported</errorDescription></UPnPError>")
}
externalPort := internalPort
m.mappings[internalPort] = externalPort
m.lastTimeout = timeout
return externalPort, nil
}
@@ -80,6 +86,7 @@ func TestManager_CreateMapping(t *testing.T) {
assert.Equal(t, uint16(51820), mapping.ExternalPort)
assert.Equal(t, "Mock-NAT", mapping.NATType)
assert.Equal(t, net.ParseIP("203.0.113.50").To4(), mapping.ExternalIP.To4())
assert.Equal(t, defaultMappingTTL, mapping.TTL)
}
func TestManager_GetMapping_ReturnsNilWhenNotReady(t *testing.T) {
@@ -131,29 +138,64 @@ func TestManager_Cleanup_NilMapping(t *testing.T) {
m.cleanup(context.Background(), gateway)
}
func TestState_Cleanup(t *testing.T) {
origDiscover := discoverGateway
defer func() { discoverGateway = origDiscover }()
mockGateway := newMockNAT()
mockGateway.mappings[51820] = 51820
discoverGateway = func(ctx context.Context) (nat.NAT, error) {
return mockGateway, nil
}
func TestManager_CreateMapping_PermanentLeaseFallback(t *testing.T) {
m := NewManager()
m.wgPort = 51820
state := &State{
Protocol: "udp",
InternalPort: 51820,
}
gateway := newMockNAT()
gateway.onlyPermanentLeases = true
err := state.Cleanup()
assert.NoError(t, err)
mapping, err := m.createMapping(context.Background(), gateway)
require.NoError(t, err)
require.NotNil(t, mapping)
_, exists := mockGateway.mappings[51820]
assert.False(t, exists, "mapping should be deleted after cleanup")
assert.Equal(t, uint16(51820), mapping.InternalPort)
assert.Equal(t, time.Duration(0), mapping.TTL, "should return zero TTL for permanent lease")
assert.Equal(t, time.Duration(0), gateway.lastTimeout, "should have retried with zero duration")
}
func TestState_Name(t *testing.T) {
state := &State{}
assert.Equal(t, "port_forward_state", state.Name())
func TestIsPermanentLeaseRequired(t *testing.T) {
tests := []struct {
name string
err error
expected bool
}{
{
name: "nil error",
err: nil,
expected: false,
},
{
name: "UPnP error 725",
err: fmt.Errorf("SOAP fault. Code: | Detail: <UPnPError><errorCode>725</errorCode><errorDescription>OnlyPermanentLeasesSupported</errorDescription></UPnPError>"),
expected: true,
},
{
name: "wrapped error with 725",
err: fmt.Errorf("add port mapping: %w", fmt.Errorf("Detail: <errorCode>725</errorCode>")),
expected: true,
},
{
name: "error 725 with newlines in XML",
err: fmt.Errorf("<errorCode>\n 725\n</errorCode>"),
expected: true,
},
{
name: "bare 725 without XML tag",
err: fmt.Errorf("error code 725"),
expected: false,
},
{
name: "unrelated error",
err: fmt.Errorf("connection refused"),
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.expected, isPermanentLeaseRequired(tt.err))
})
}
}

View File

@@ -1,50 +0,0 @@
//go:build !js
package portforward
import (
"context"
"fmt"
"github.com/libp2p/go-nat"
log "github.com/sirupsen/logrus"
)
// discoverGateway is the function used for NAT gateway discovery.
// It can be replaced in tests to avoid real network operations.
var discoverGateway = nat.DiscoverGateway
// State is persisted only for crash recovery cleanup
type State struct {
InternalPort uint16 `json:"internal_port,omitempty"`
Protocol string `json:"protocol,omitempty"`
}
func (s *State) Name() string {
return "port_forward_state"
}
// Cleanup implements statemanager.CleanableState for crash recovery
func (s *State) Cleanup() error {
if s.InternalPort == 0 {
return nil
}
log.Infof("cleaning up stale port mapping for port %d", s.InternalPort)
ctx, cancel := context.WithTimeout(context.Background(), discoveryTimeout)
defer cancel()
gateway, err := discoverGateway(ctx)
if err != nil {
// Discovery failure is not an error - gateway may not exist
log.Debugf("cleanup: no gateway found: %v", err)
return nil
}
if err := gateway.DeletePortMapping(ctx, s.Protocol, int(s.InternalPort)); err != nil {
return fmt.Errorf("delete port mapping: %w", err)
}
return nil
}

View File

@@ -105,7 +105,20 @@ func (r *SysOps) FlushMarkedRoutes() error {
}
func (r *SysOps) addToRouteTable(prefix netip.Prefix, nexthop Nexthop) error {
return r.routeSocket(unix.RTM_ADD, prefix, nexthop)
if err := r.routeSocket(unix.RTM_ADD, prefix, nexthop); err != nil {
if !errors.Is(err, unix.EEXIST) {
return err
}
// Route already exists from a previous session that wasn't cleaned up properly
// (e.g. macOS sleep/wake). Remove the stale route and retry.
log.Infof("Route for %s already exists, replacing with new route", prefix)
if err := r.routeSocket(unix.RTM_DELETE, prefix, nexthop); err != nil {
log.Warnf("Failed to remove stale route for %s: %v", prefix, err)
}
return r.routeSocket(unix.RTM_ADD, prefix, nexthop)
}
return nil
}
func (r *SysOps) removeFromRouteTable(prefix netip.Prefix, nexthop Nexthop) error {
@@ -228,7 +241,7 @@ func (r *SysOps) parseRouteResponse(buf []byte) error {
rtMsg := (*unix.RtMsghdr)(unsafe.Pointer(&buf[0]))
if rtMsg.Errno != 0 {
return fmt.Errorf("parse: %d", rtMsg.Errno)
return fmt.Errorf("route socket: %w", syscall.Errno(rtMsg.Errno))
}
return nil

View File

@@ -10,10 +10,6 @@ import (
)
// registerStates registers all states that need crash recovery cleanup.
// Note: portforward.State is intentionally NOT registered here to avoid blocking startup
// for up to 10 seconds during NAT gateway discovery when no gateway is present.
// The gateway reference cannot be persisted across restarts, so cleanup requires re-discovery.
// Port forward cleanup is handled by the Manager during normal operation instead.
func registerStates(mgr *statemanager.Manager) {
mgr.RegisterState(&dns.ShutdownState{})
mgr.RegisterState(&systemops.ShutdownState{})

View File

@@ -12,10 +12,6 @@ import (
)
// registerStates registers all states that need crash recovery cleanup.
// Note: portforward.State is intentionally NOT registered here to avoid blocking startup
// for up to 10 seconds during NAT gateway discovery when no gateway is present.
// The gateway reference cannot be persisted across restarts, so cleanup requires re-discovery.
// Port forward cleanup is handled by the Manager during normal operation instead.
func registerStates(mgr *statemanager.Manager) {
mgr.RegisterState(&dns.ShutdownState{})
mgr.RegisterState(&systemops.ShutdownState{})

View File

@@ -2099,6 +2099,7 @@ func (s *SqlStore) getServices(ctx context.Context, accountID string) ([]*rpserv
var createdAt, certIssuedAt sql.NullTime
var status, proxyCluster, sessionPrivateKey, sessionPublicKey sql.NullString
var mode, source, sourcePeer sql.NullString
var terminated sql.NullBool
err := row.Scan(
&s.ID,
&s.AccountID,
@@ -2119,7 +2120,7 @@ func (s *SqlStore) getServices(ctx context.Context, accountID string) ([]*rpserv
&s.PortAutoAssigned,
&source,
&sourcePeer,
&s.Terminated,
&terminated,
)
if err != nil {
return nil, err
@@ -2160,7 +2161,9 @@ func (s *SqlStore) getServices(ctx context.Context, accountID string) ([]*rpserv
if sourcePeer.Valid {
s.SourcePeer = sourcePeer.String
}
if terminated.Valid {
s.Terminated = terminated.Bool
}
s.Targets = []*rpservice.Target{}
return &s, nil
})

View File

@@ -0,0 +1,217 @@
package types_test
import (
"context"
"fmt"
"os"
"testing"
nbdns "github.com/netbirdio/netbird/dns"
"github.com/netbirdio/netbird/management/server/types"
)
type benchmarkScale struct {
name string
peers int
groups int
}
var defaultScales = []benchmarkScale{
{"100peers_5groups", 100, 5},
{"500peers_20groups", 500, 20},
{"1000peers_50groups", 1000, 50},
{"5000peers_100groups", 5000, 100},
{"10000peers_200groups", 10000, 200},
{"20000peers_200groups", 20000, 200},
{"30000peers_300groups", 30000, 300},
}
func skipCIBenchmark(b *testing.B) {
if os.Getenv("CI") == "true" {
b.Skip("Skipping benchmark in CI")
}
}
// ──────────────────────────────────────────────────────────────────────────────
// Single Peer Network Map Generation
// ──────────────────────────────────────────────────────────────────────────────
// BenchmarkNetworkMapGeneration_Components benchmarks the components-based approach for a single peer.
func BenchmarkNetworkMapGeneration_Components(b *testing.B) {
skipCIBenchmark(b)
for _, scale := range defaultScales {
b.Run(scale.name, func(b *testing.B) {
account, validatedPeers := scalableTestAccount(scale.peers, scale.groups)
ctx := context.Background()
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()
b.ReportAllocs()
b.ResetTimer()
for range b.N {
_ = account.GetPeerNetworkMapFromComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, nil, groupIDToUserIDs)
}
})
}
}
// ──────────────────────────────────────────────────────────────────────────────
// All Peers (UpdateAccountPeers hot path)
// ──────────────────────────────────────────────────────────────────────────────
// BenchmarkNetworkMapGeneration_AllPeers benchmarks generating network maps for ALL peers.
func BenchmarkNetworkMapGeneration_AllPeers(b *testing.B) {
skipCIBenchmark(b)
scales := []benchmarkScale{
{"100peers_5groups", 100, 5},
{"500peers_20groups", 500, 20},
{"1000peers_50groups", 1000, 50},
{"5000peers_100groups", 5000, 100},
}
for _, scale := range scales {
account, validatedPeers := scalableTestAccount(scale.peers, scale.groups)
ctx := context.Background()
peerIDs := make([]string, 0, len(account.Peers))
for peerID := range account.Peers {
peerIDs = append(peerIDs, peerID)
}
b.Run("components/"+scale.name, func(b *testing.B) {
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()
b.ReportAllocs()
b.ResetTimer()
for range b.N {
for _, peerID := range peerIDs {
_ = account.GetPeerNetworkMapFromComponents(ctx, peerID, nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, nil, groupIDToUserIDs)
}
}
})
}
}
// ──────────────────────────────────────────────────────────────────────────────
// Sub-operations
// ──────────────────────────────────────────────────────────────────────────────
// BenchmarkNetworkMapGeneration_ComponentsCreation benchmarks components extraction.
func BenchmarkNetworkMapGeneration_ComponentsCreation(b *testing.B) {
skipCIBenchmark(b)
for _, scale := range defaultScales {
b.Run(scale.name, func(b *testing.B) {
account, validatedPeers := scalableTestAccount(scale.peers, scale.groups)
ctx := context.Background()
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()
b.ReportAllocs()
b.ResetTimer()
for range b.N {
_ = account.GetPeerNetworkMapComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, groupIDToUserIDs)
}
})
}
}
// BenchmarkNetworkMapGeneration_ComponentsCalculation benchmarks calculation from pre-built components.
func BenchmarkNetworkMapGeneration_ComponentsCalculation(b *testing.B) {
skipCIBenchmark(b)
for _, scale := range defaultScales {
b.Run(scale.name, func(b *testing.B) {
account, validatedPeers := scalableTestAccount(scale.peers, scale.groups)
ctx := context.Background()
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()
components := account.GetPeerNetworkMapComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, groupIDToUserIDs)
b.ReportAllocs()
b.ResetTimer()
for range b.N {
_ = types.CalculateNetworkMapFromComponents(ctx, components)
}
})
}
}
// BenchmarkNetworkMapGeneration_PrecomputeMaps benchmarks precomputed map costs.
func BenchmarkNetworkMapGeneration_PrecomputeMaps(b *testing.B) {
skipCIBenchmark(b)
for _, scale := range defaultScales {
b.Run("ResourcePoliciesMap/"+scale.name, func(b *testing.B) {
account, _ := scalableTestAccount(scale.peers, scale.groups)
b.ReportAllocs()
b.ResetTimer()
for range b.N {
_ = account.GetResourcePoliciesMap()
}
})
b.Run("ResourceRoutersMap/"+scale.name, func(b *testing.B) {
account, _ := scalableTestAccount(scale.peers, scale.groups)
b.ReportAllocs()
b.ResetTimer()
for range b.N {
_ = account.GetResourceRoutersMap()
}
})
b.Run("ActiveGroupUsers/"+scale.name, func(b *testing.B) {
account, _ := scalableTestAccount(scale.peers, scale.groups)
b.ReportAllocs()
b.ResetTimer()
for range b.N {
_ = account.GetActiveGroupUsers()
}
})
}
}
// ──────────────────────────────────────────────────────────────────────────────
// Scaling Analysis
// ──────────────────────────────────────────────────────────────────────────────
// BenchmarkNetworkMapGeneration_GroupScaling tests group count impact on performance.
func BenchmarkNetworkMapGeneration_GroupScaling(b *testing.B) {
skipCIBenchmark(b)
groupCounts := []int{1, 5, 20, 50, 100, 200, 500}
for _, numGroups := range groupCounts {
b.Run(fmt.Sprintf("components_%dgroups", numGroups), func(b *testing.B) {
account, validatedPeers := scalableTestAccount(1000, numGroups)
ctx := context.Background()
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()
b.ReportAllocs()
b.ResetTimer()
for range b.N {
_ = account.GetPeerNetworkMapFromComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, nil, groupIDToUserIDs)
}
})
}
}
// BenchmarkNetworkMapGeneration_PeerScaling tests peer count impact on performance.
func BenchmarkNetworkMapGeneration_PeerScaling(b *testing.B) {
skipCIBenchmark(b)
peerCounts := []int{50, 100, 500, 1000, 2000, 5000, 10000, 20000, 30000}
for _, numPeers := range peerCounts {
numGroups := numPeers / 20
if numGroups < 1 {
numGroups = 1
}
b.Run(fmt.Sprintf("components_%dpeers", numPeers), func(b *testing.B) {
account, validatedPeers := scalableTestAccount(numPeers, numGroups)
ctx := context.Background()
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()
b.ReportAllocs()
b.ResetTimer()
for range b.N {
_ = account.GetPeerNetworkMapFromComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, nil, groupIDToUserIDs)
}
})
}
}

File diff suppressed because it is too large Load Diff