Update infrastructure_files/getting-started.sh

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Add CrowdSec LAPI container to self-hosted setup script
2026-04-14 21:46:19 -04:00 · 2026-04-14 13:39:16 +02:00 · 2026-04-14 12:54:15 +02:00 · 2026-04-13 10:42:24 +02:00 · 2026-04-13 09:38:38 +02:00
7 changed files with 595 additions and 81 deletions
--- a/client/internal/routemanager/manager.go
+++ b/client/internal/routemanager/manager.go
@@ -168,6 +168,7 @@ func (m *DefaultManager) setupAndroidRoutes(config ManagerConfig) {
 			NetworkType: route.IPv4Network,
 		}
 		cr = append(cr, fakeIPRoute)
+		m.notifier.SetFakeIPRoute(fakeIPRoute)
 	}

 	m.notifier.SetInitialClientRoutes(cr, routesForComparison)
--- a/client/internal/routemanager/notifier/notifier_android.go
+++ b/client/internal/routemanager/notifier/notifier_android.go
@@ -16,6 +16,7 @@ import (
 type Notifier struct {
 	initialRoutes []*route.Route
 	currentRoutes []*route.Route
+	fakeIPRoute   *route.Route

 	listener    listener.NetworkChangeListener
 	listenerMux sync.Mutex
@@ -31,13 +32,17 @@ func (n *Notifier) SetListener(listener listener.NetworkChangeListener) {
 	n.listener = listener
 }

-// SetInitialClientRoutes stores the full initial route set (including fake IP blocks)
-// and a separate comparison set (without fake IP blocks) for diff detection.
+// SetInitialClientRoutes stores the initial route sets for TUN configuration.
 func (n *Notifier) SetInitialClientRoutes(initialRoutes []*route.Route, routesForComparison []*route.Route) {
 	n.initialRoutes = filterStatic(initialRoutes)
 	n.currentRoutes = filterStatic(routesForComparison)
 }

+// SetFakeIPRoute stores the fake IP route to be included in every TUN rebuild.
+func (n *Notifier) SetFakeIPRoute(r *route.Route) {
+	n.fakeIPRoute = r
+}
+
 func (n *Notifier) OnNewRoutes(idMap route.HAMap) {
 	var newRoutes []*route.Route
 	for _, routes := range idMap {
@@ -69,7 +74,9 @@ func (n *Notifier) notify() {
 	}

 	allRoutes := slices.Clone(n.currentRoutes)
-	allRoutes = append(allRoutes, n.extraInitialRoutes()...)
+	if n.fakeIPRoute != nil {
+		allRoutes = append(allRoutes, n.fakeIPRoute)
+	}

 	routeStrings := n.routesToStrings(allRoutes)
 	sort.Strings(routeStrings)
@@ -78,23 +85,6 @@ func (n *Notifier) notify() {
 	}(n.listener)
 }

-// extraInitialRoutes returns initialRoutes whose network prefix is absent
-// from currentRoutes (e.g. the fake IP block added at setup time).
-func (n *Notifier) extraInitialRoutes() []*route.Route {
-	currentNets := make(map[netip.Prefix]struct{}, len(n.currentRoutes))
-	for _, r := range n.currentRoutes {
-		currentNets[r.Network] = struct{}{}
-	}
-
-	var extra []*route.Route
-	for _, r := range n.initialRoutes {
-		if _, ok := currentNets[r.Network]; !ok {
-			extra = append(extra, r)
-		}
-	}
-	return extra
-}
-
 func filterStatic(routes []*route.Route) []*route.Route {
 	out := make([]*route.Route, 0, len(routes))
 	for _, r := range routes {
--- a/client/internal/routemanager/notifier/notifier_ios.go
+++ b/client/internal/routemanager/notifier/notifier_ios.go
@@ -34,6 +34,10 @@ func (n *Notifier) SetInitialClientRoutes([]*route.Route, []*route.Route) {
 	// iOS doesn't care about initial routes
 }

+func (n *Notifier) SetFakeIPRoute(*route.Route) {
+	// Not used on iOS
+}
+
 func (n *Notifier) OnNewRoutes(route.HAMap) {
 	// Not used on iOS
 }
--- a/client/internal/routemanager/notifier/notifier_other.go
+++ b/client/internal/routemanager/notifier/notifier_other.go
@@ -23,6 +23,10 @@ func (n *Notifier) SetInitialClientRoutes([]*route.Route, []*route.Route) {
 	// Not used on non-mobile platforms
 }

+func (n *Notifier) SetFakeIPRoute(*route.Route) {
+	// Not used on non-mobile platforms
+}
+
 func (n *Notifier) OnNewRoutes(idMap route.HAMap) {
 	// Not used on non-mobile platforms
 }
--- a/flow/client/client.go
+++ b/flow/client/client.go
@@ -14,7 +14,6 @@ import (
 	log "github.com/sirupsen/logrus"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/connectivity"
 	"google.golang.org/grpc/credentials"
 	"google.golang.org/grpc/credentials/insecure"
 	"google.golang.org/grpc/keepalive"
@@ -26,11 +25,22 @@ import (
 	"github.com/netbirdio/netbird/util/wsproxy"
 )

+var ErrClientClosed = errors.New("client is closed")
+
+// minHealthyDuration is the minimum time a stream must survive before a failure
+// resets the backoff timer. Streams that fail faster are considered unhealthy and
+// should not reset backoff, so that MaxElapsedTime can eventually stop retries.
+const minHealthyDuration = 5 * time.Second
+
 type GRPCClient struct {
 	realClient proto.FlowServiceClient
 	clientConn *grpc.ClientConn
 	stream     proto.FlowService_EventsClient
-	streamMu   sync.Mutex
+	target     string
+	opts       []grpc.DialOption
+	closed     bool       // prevent creating conn in the middle of the Close
+	receiving  bool       // prevent concurrent Receive calls
+	mu         sync.Mutex // protects clientConn, realClient, stream, closed, and receiving
 }

 func NewClient(addr, payload, signature string, interval time.Duration) (*GRPCClient, error) {
@@ -65,7 +75,8 @@ func NewClient(addr, payload, signature string, interval time.Duration) (*GRPCCl
 		grpc.WithDefaultServiceConfig(`{"healthCheckConfig": {"serviceName": ""}}`),
 	)

-	conn, err := grpc.NewClient(fmt.Sprintf("%s:%s", parsedURL.Hostname(), parsedURL.Port()), opts...)
+	target := parsedURL.Host
+	conn, err := grpc.NewClient(target, opts...)
 	if err != nil {
 		return nil, fmt.Errorf("creating new grpc client: %w", err)
 	}
@@ -73,30 +84,73 @@ func NewClient(addr, payload, signature string, interval time.Duration) (*GRPCCl
 	return &GRPCClient{
 		realClient: proto.NewFlowServiceClient(conn),
 		clientConn: conn,
+		target:     target,
+		opts:       opts,
 	}, nil
 }

 func (c *GRPCClient) Close() error {
-	c.streamMu.Lock()
-	defer c.streamMu.Unlock()
-
+	c.mu.Lock()
+	c.closed = true
 	c.stream = nil
-	if err := c.clientConn.Close(); err != nil && !errors.Is(err, context.Canceled) {
+	conn := c.clientConn
+	c.clientConn = nil
+	c.mu.Unlock()
+
+	if conn == nil {
+		return nil
+	}
+
+	if err := conn.Close(); err != nil && !errors.Is(err, context.Canceled) {
 		return fmt.Errorf("close client connection: %w", err)
 	}

 	return nil
 }

+func (c *GRPCClient) Send(event *proto.FlowEvent) error {
+	c.mu.Lock()
+	stream := c.stream
+	c.mu.Unlock()
+
+	if stream == nil {
+		return errors.New("stream not initialized")
+	}
+
+	if err := stream.Send(event); err != nil {
+		return fmt.Errorf("send flow event: %w", err)
+	}
+
+	return nil
+}
+
 func (c *GRPCClient) Receive(ctx context.Context, interval time.Duration, msgHandler func(msg *proto.FlowEventAck) error) error {
+	c.mu.Lock()
+	if c.receiving {
+		c.mu.Unlock()
+		return errors.New("concurrent Receive calls are not supported")
+	}
+	c.receiving = true
+	c.mu.Unlock()
+	defer func() {
+		c.mu.Lock()
+		c.receiving = false
+		c.mu.Unlock()
+	}()
+
 	backOff := defaultBackoff(ctx, interval)
 	operation := func() error {
-		if err := c.establishStreamAndReceive(ctx, msgHandler); err != nil {
-			if s, ok := status.FromError(err); ok && s.Code() == codes.Canceled {
-				return fmt.Errorf("receive: %w: %w", err, context.Canceled)
-			}
+		stream, err := c.establishStream(ctx)
+		if err != nil {
+			log.Errorf("failed to establish flow stream, retrying: %v", err)
+			return c.handleRetryableError(err, time.Time{}, backOff)
+		}
+
+		streamStart := time.Now()
+
+		if err := c.receive(stream, msgHandler); err != nil {
 			log.Errorf("receive failed: %v", err)
-			return fmt.Errorf("receive: %w", err)
+			return c.handleRetryableError(err, streamStart, backOff)
 		}
 		return nil
 	}
@@ -108,37 +162,106 @@ func (c *GRPCClient) Receive(ctx context.Context, interval time.Duration, msgHan
 	return nil
 }

-func (c *GRPCClient) establishStreamAndReceive(ctx context.Context, msgHandler func(msg *proto.FlowEventAck) error) error {
-	if c.clientConn.GetState() == connectivity.Shutdown {
-		return errors.New("connection to flow receiver has been shut down")
+// handleRetryableError resets the backoff timer if the stream was healthy long
+// enough and recreates the underlying ClientConn so that gRPC's internal
+// subchannel backoff does not accumulate and compete with our own retry timer.
+// A zero streamStart means the stream was never established.
+func (c *GRPCClient) handleRetryableError(err error, streamStart time.Time, backOff backoff.BackOff) error {
+	if isContextDone(err) {
+		return backoff.Permanent(err)
 	}

-	stream, err := c.realClient.Events(ctx, grpc.WaitForReady(true))
-	if err != nil {
-		return fmt.Errorf("create event stream: %w", err)
+	var permErr *backoff.PermanentError
+	if errors.As(err, &permErr) {
+		return err
 	}

-	err = stream.Send(&proto.FlowEvent{IsInitiator: true})
+	// Reset the backoff so the next retry starts with a short delay instead of
+	// continuing the already-elapsed timer. Only do this if the stream was healthy
+	// long enough; short-lived connect/drop cycles must not defeat MaxElapsedTime.
+	if !streamStart.IsZero() && time.Since(streamStart) >= minHealthyDuration {
+		backOff.Reset()
+	}
+
+	if recreateErr := c.recreateConnection(); recreateErr != nil {
+		log.Errorf("recreate connection: %v", recreateErr)
+		return recreateErr
+	}
+
+	log.Infof("connection recreated, retrying stream")
+	return fmt.Errorf("retrying after error: %w", err)
+}
+
+func (c *GRPCClient) recreateConnection() error {
+	c.mu.Lock()
+	if c.closed {
+		c.mu.Unlock()
+		return backoff.Permanent(ErrClientClosed)
+	}
+
+	conn, err := grpc.NewClient(c.target, c.opts...)
 	if err != nil {
-		log.Infof("failed to send initiator message to flow receiver but will attempt to continue. Error: %s", err)
+		c.mu.Unlock()
+		return fmt.Errorf("create new connection: %w", err)
+	}
+
+	old := c.clientConn
+	c.clientConn = conn
+	c.realClient = proto.NewFlowServiceClient(conn)
+	c.stream = nil
+	c.mu.Unlock()
+
+	_ = old.Close()
+
+	return nil
+}
+
+func (c *GRPCClient) establishStream(ctx context.Context) (proto.FlowService_EventsClient, error) {
+	c.mu.Lock()
+	if c.closed {
+		c.mu.Unlock()
+		return nil, backoff.Permanent(ErrClientClosed)
+	}
+	cl := c.realClient
+	c.mu.Unlock()
+
+	// open stream outside the lock — blocking operation
+	stream, err := cl.Events(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("create event stream: %w", err)
+	}
+	streamReady := false
+	defer func() {
+		if !streamReady {
+			_ = stream.CloseSend()
+		}
+	}()
+
+	if err = stream.Send(&proto.FlowEvent{IsInitiator: true}); err != nil {
+		return nil, fmt.Errorf("send initiator: %w", err)
 	}

 	if err = checkHeader(stream); err != nil {
-		return fmt.Errorf("check header: %w", err)
+		return nil, fmt.Errorf("check header: %w", err)
 	}

-	c.streamMu.Lock()
+	c.mu.Lock()
+	if c.closed {
+		c.mu.Unlock()
+		return nil, backoff.Permanent(ErrClientClosed)
+	}
 	c.stream = stream
-	c.streamMu.Unlock()
+	c.mu.Unlock()
+	streamReady = true

-	return c.receive(stream, msgHandler)
+	return stream, nil
 }

 func (c *GRPCClient) receive(stream proto.FlowService_EventsClient, msgHandler func(msg *proto.FlowEventAck) error) error {
 	for {
 		msg, err := stream.Recv()
 		if err != nil {
-			return fmt.Errorf("receive from stream: %w", err)
+			return err
 		}

 		if msg.IsInitiator {
@@ -169,7 +292,7 @@ func checkHeader(stream proto.FlowService_EventsClient) error {
 func defaultBackoff(ctx context.Context, interval time.Duration) backoff.BackOff {
 	return backoff.WithContext(&backoff.ExponentialBackOff{
 		InitialInterval:     800 * time.Millisecond,
-		RandomizationFactor: 1,
+		RandomizationFactor: 0.5,
 		Multiplier:          1.7,
 		MaxInterval:         interval / 2,
 		MaxElapsedTime:      3 * 30 * 24 * time.Hour, // 3 months
@@ -178,18 +301,12 @@ func defaultBackoff(ctx context.Context, interval time.Duration) backoff.BackOff
 	}, ctx)
 }

-func (c *GRPCClient) Send(event *proto.FlowEvent) error {
-	c.streamMu.Lock()
-	stream := c.stream
-	c.streamMu.Unlock()
-
-	if stream == nil {
-		return errors.New("stream not initialized")
+func isContextDone(err error) bool {
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		return true
 	}
-
-	if err := stream.Send(event); err != nil {
-		return fmt.Errorf("send flow event: %w", err)
+	if s, ok := status.FromError(err); ok {
+		return s.Code() == codes.Canceled || s.Code() == codes.DeadlineExceeded
 	}
-
-	return nil
+	return false
 }
--- a/flow/client/client_test.go
+++ b/flow/client/client_test.go
@@ -2,8 +2,11 @@ package client_test

 import (
 	"context"
+	"encoding/binary"
 	"errors"
 	"net"
+	"sync"
+	"sync/atomic"
 	"testing"
 	"time"

@@ -11,6 +14,8 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"

 	flow "github.com/netbirdio/netbird/flow/client"
 	"github.com/netbirdio/netbird/flow/proto"
@@ -18,21 +23,89 @@ import (

 type testServer struct {
 	proto.UnimplementedFlowServiceServer
-	events  chan *proto.FlowEvent
-	acks    chan *proto.FlowEventAck
-	grpcSrv *grpc.Server
-	addr    string
+	events         chan *proto.FlowEvent
+	acks           chan *proto.FlowEventAck
+	grpcSrv        *grpc.Server
+	addr           string
+	listener       *connTrackListener
+	closeStream    chan struct{} // signal server to close the stream
+	handlerDone    chan struct{} // signaled each time Events() exits
+	handlerStarted chan struct{} // signaled each time Events() begins
+}
+
+// connTrackListener wraps a net.Listener to track accepted connections
+// so tests can forcefully close them to simulate PROTOCOL_ERROR/RST_STREAM.
+type connTrackListener struct {
+	net.Listener
+	mu    sync.Mutex
+	conns []net.Conn
+}
+
+func (l *connTrackListener) Accept() (net.Conn, error) {
+	c, err := l.Listener.Accept()
+	if err != nil {
+		return nil, err
+	}
+	l.mu.Lock()
+	l.conns = append(l.conns, c)
+	l.mu.Unlock()
+	return c, nil
+}
+
+// sendRSTStream writes a raw HTTP/2 RST_STREAM frame with PROTOCOL_ERROR
+// (error code 0x1) on every tracked connection. This produces the exact error:
+//
+//	rpc error: code = Internal desc = stream terminated by RST_STREAM with error code: PROTOCOL_ERROR
+//
+// HTTP/2 RST_STREAM frame format (9-byte header + 4-byte payload):
+//
+//	Length (3 bytes): 0x000004
+//	Type   (1 byte):  0x03 (RST_STREAM)
+//	Flags  (1 byte):  0x00
+//	Stream ID (4 bytes): target stream (must have bit 31 clear)
+//	Error Code (4 bytes): 0x00000001 (PROTOCOL_ERROR)
+func (l *connTrackListener) connCount() int {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	return len(l.conns)
+}
+
+func (l *connTrackListener) sendRSTStream(streamID uint32) {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	frame := make([]byte, 13) // 9-byte header + 4-byte payload
+	// Length = 4 (3 bytes, big-endian)
+	frame[0], frame[1], frame[2] = 0, 0, 4
+	// Type = RST_STREAM (0x03)
+	frame[3] = 0x03
+	// Flags = 0
+	frame[4] = 0x00
+	// Stream ID (4 bytes, big-endian, bit 31 reserved = 0)
+	binary.BigEndian.PutUint32(frame[5:9], streamID)
+	// Error Code = PROTOCOL_ERROR (0x1)
+	binary.BigEndian.PutUint32(frame[9:13], 0x1)
+
+	for _, c := range l.conns {
+		_, _ = c.Write(frame)
+	}
 }

 func newTestServer(t *testing.T) *testServer {
-	listener, err := net.Listen("tcp", "127.0.0.1:0")
+	rawListener, err := net.Listen("tcp", "127.0.0.1:0")
 	require.NoError(t, err)

+	listener := &connTrackListener{Listener: rawListener}
+
 	s := &testServer{
-		events:  make(chan *proto.FlowEvent, 100),
-		acks:    make(chan *proto.FlowEventAck, 100),
-		grpcSrv: grpc.NewServer(),
-		addr:    listener.Addr().String(),
+		events:         make(chan *proto.FlowEvent, 100),
+		acks:           make(chan *proto.FlowEventAck, 100),
+		grpcSrv:        grpc.NewServer(),
+		addr:           rawListener.Addr().String(),
+		listener:       listener,
+		closeStream:    make(chan struct{}, 1),
+		handlerDone:    make(chan struct{}, 10),
+		handlerStarted: make(chan struct{}, 10),
 	}

 	proto.RegisterFlowServiceServer(s.grpcSrv, s)
@@ -51,11 +124,23 @@ func newTestServer(t *testing.T) *testServer {
 }

 func (s *testServer) Events(stream proto.FlowService_EventsServer) error {
+	defer func() {
+		select {
+		case s.handlerDone <- struct{}{}:
+		default:
+		}
+	}()
+
 	err := stream.Send(&proto.FlowEventAck{IsInitiator: true})
 	if err != nil {
 		return err
 	}

+	select {
+	case s.handlerStarted <- struct{}{}:
+	default:
+	}
+
 	ctx, cancel := context.WithCancel(stream.Context())
 	defer cancel()

@@ -91,6 +176,8 @@ func (s *testServer) Events(stream proto.FlowService_EventsServer) error {
 			if err := stream.Send(ack); err != nil {
 				return err
 			}
+		case <-s.closeStream:
+			return status.Errorf(codes.Internal, "server closing stream")
 		case <-ctx.Done():
 			return ctx.Err()
 		}
@@ -110,16 +197,13 @@ func TestReceive(t *testing.T) {
 		assert.NoError(t, err, "failed to close flow")
 	})

-	receivedAcks := make(map[string]bool)
+	var ackCount atomic.Int32
 	receiveDone := make(chan struct{})

 	go func() {
 		err := client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
 			if !msg.IsInitiator && len(msg.EventId) > 0 {
-				id := string(msg.EventId)
-				receivedAcks[id] = true
-
-				if len(receivedAcks) >= 3 {
+				if ackCount.Add(1) >= 3 {
 					close(receiveDone)
 				}
 			}
@@ -130,7 +214,11 @@ func TestReceive(t *testing.T) {
 		}
 	}()

-	time.Sleep(500 * time.Millisecond)
+	select {
+	case <-server.handlerStarted:
+	case <-time.After(3 * time.Second):
+		t.Fatal("timeout waiting for stream to be established")
+	}

 	for i := 0; i < 3; i++ {
 		eventID := uuid.New().String()
@@ -153,7 +241,7 @@ func TestReceive(t *testing.T) {
 		t.Fatal("timeout waiting for acks to be processed")
 	}

-	assert.Equal(t, 3, len(receivedAcks))
+	assert.Equal(t, int32(3), ackCount.Load())
 }

 func TestReceive_ContextCancellation(t *testing.T) {
@@ -254,3 +342,195 @@ func TestSend(t *testing.T) {
 		t.Fatal("timeout waiting for ack to be received by flow")
 	}
 }
+
+func TestNewClient_PermanentClose(t *testing.T) {
+	server := newTestServer(t)
+
+	client, err := flow.NewClient("http://"+server.addr, "test-payload", "test-signature", 1*time.Second)
+	require.NoError(t, err)
+
+	err = client.Close()
+	require.NoError(t, err)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	t.Cleanup(cancel)
+
+	done := make(chan error, 1)
+	go func() {
+		done <- client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
+			return nil
+		})
+	}()
+
+	select {
+	case err := <-done:
+		require.ErrorIs(t, err, flow.ErrClientClosed)
+	case <-time.After(2 * time.Second):
+		t.Fatal("Receive did not return after Close — stuck in retry loop")
+	}
+}
+
+func TestNewClient_CloseVerify(t *testing.T) {
+	server := newTestServer(t)
+
+	client, err := flow.NewClient("http://"+server.addr, "test-payload", "test-signature", 1*time.Second)
+	require.NoError(t, err)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	t.Cleanup(cancel)
+
+	done := make(chan error, 1)
+	go func() {
+		done <- client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
+			return nil
+		})
+	}()
+
+	closeDone := make(chan struct{}, 1)
+	go func() {
+		_ = client.Close()
+		closeDone <- struct{}{}
+	}()
+
+	select {
+	case err := <-done:
+		require.Error(t, err)
+	case <-time.After(2 * time.Second):
+		t.Fatal("Receive did not return after Close — stuck in retry loop")
+	}
+
+	select {
+	case <-closeDone:
+		return
+	case <-time.After(2 * time.Second):
+		t.Fatal("Close did not return — blocked in retry loop")
+	}
+
+}
+
+func TestClose_WhileReceiving(t *testing.T) {
+	server := newTestServer(t)
+	client, err := flow.NewClient("http://"+server.addr, "test-payload", "test-signature", 1*time.Second)
+	require.NoError(t, err)
+
+	ctx := context.Background() // no timeout — intentional
+	receiveDone := make(chan struct{})
+	go func() {
+		_ = client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
+			return nil
+		})
+		close(receiveDone)
+	}()
+
+	// Wait for the server-side handler to confirm the stream is established.
+	select {
+	case <-server.handlerStarted:
+	case <-time.After(3 * time.Second):
+		t.Fatal("timeout waiting for stream to be established")
+	}
+
+	closeDone := make(chan struct{})
+	go func() {
+		_ = client.Close()
+		close(closeDone)
+	}()
+
+	select {
+	case <-closeDone:
+		// Close returned — good
+	case <-time.After(2 * time.Second):
+		t.Fatal("Close blocked forever — Receive stuck in retry loop")
+	}
+
+	select {
+	case <-receiveDone:
+	case <-time.After(2 * time.Second):
+		t.Fatal("Receive did not exit after Close")
+	}
+}
+
+func TestReceive_ProtocolErrorStreamReconnect(t *testing.T) {
+	server := newTestServer(t)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	t.Cleanup(cancel)
+
+	client, err := flow.NewClient("http://"+server.addr, "test-payload", "test-signature", 1*time.Second)
+	require.NoError(t, err)
+	t.Cleanup(func() {
+		err := client.Close()
+		assert.NoError(t, err, "failed to close flow")
+	})
+
+	// Track acks received before and after server-side stream close
+	var ackCount atomic.Int32
+	receivedFirst := make(chan struct{})
+	receivedAfterReconnect := make(chan struct{})
+
+	go func() {
+		err := client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
+			if msg.IsInitiator || len(msg.EventId) == 0 {
+				return nil
+			}
+			n := ackCount.Add(1)
+			if n == 1 {
+				close(receivedFirst)
+			}
+			if n == 2 {
+				close(receivedAfterReconnect)
+			}
+			return nil
+		})
+		if err != nil && !errors.Is(err, context.Canceled) {
+			t.Logf("receive error: %v", err)
+		}
+	}()
+
+	// Wait for stream to be established, then send first ack
+	select {
+	case <-server.handlerStarted:
+	case <-time.After(3 * time.Second):
+		t.Fatal("timeout waiting for stream to be established")
+	}
+	server.acks <- &proto.FlowEventAck{EventId: []byte("before-close")}
+
+	select {
+	case <-receivedFirst:
+	case <-time.After(3 * time.Second):
+		t.Fatal("timeout waiting for first ack")
+	}
+
+	// Snapshot connection count before injecting the fault.
+	connsBefore := server.listener.connCount()
+
+	// Send a raw HTTP/2 RST_STREAM frame with PROTOCOL_ERROR on the TCP connection.
+	// gRPC multiplexes streams on stream IDs 1, 3, 5, ... (odd, client-initiated).
+	// Stream ID 1 is the client's first stream (our Events bidi stream).
+	// This produces the exact error the client sees in production:
+	//   "stream terminated by RST_STREAM with error code: PROTOCOL_ERROR"
+	server.listener.sendRSTStream(1)
+
+	// Wait for the old Events() handler to fully exit so it can no longer
+	// drain s.acks and drop our injected ack on a broken stream.
+	select {
+	case <-server.handlerDone:
+	case <-time.After(5 * time.Second):
+		t.Fatal("old Events() handler did not exit after RST_STREAM")
+	}
+
+	require.Eventually(t, func() bool {
+		return server.listener.connCount() > connsBefore
+	}, 5*time.Second, 50*time.Millisecond, "client did not open a new TCP connection after RST_STREAM")
+
+	server.acks <- &proto.FlowEventAck{EventId: []byte("after-close")}
+
+	select {
+	case <-receivedAfterReconnect:
+		// Client successfully reconnected and received ack after server-side stream close
+	case <-time.After(5 * time.Second):
+		t.Fatal("timeout waiting for ack after server-side stream close — client did not reconnect")
+	}
+
+	assert.GreaterOrEqual(t, int(ackCount.Load()), 2, "should have received acks before and after stream close")
+	assert.GreaterOrEqual(t, server.listener.connCount(), 2, "client should have created at least 2 TCP connections (original + reconnect)")
+}
--- a/infrastructure_files/getting-started.sh
+++ b/infrastructure_files/getting-started.sh
@@ -182,6 +182,23 @@ read_enable_proxy() {
  return 0
 }

+read_enable_crowdsec() {
+  echo "" > /dev/stderr
+  echo "Do you want to enable CrowdSec IP reputation blocking?" > /dev/stderr
+  echo "CrowdSec checks client IPs against a community threat intelligence database" > /dev/stderr
+  echo "and blocks known malicious sources before they reach your services." > /dev/stderr
+  echo "A local CrowdSec LAPI container will be added to your deployment." > /dev/stderr
+  echo -n "Enable CrowdSec? [y/N]: " > /dev/stderr
+  read -r CHOICE < /dev/tty
+
+  if [[ "$CHOICE" =~ ^[Yy]$ ]]; then
+    echo "true"
+  else
+    echo "false"
+  fi
+  return 0
+}
+
 read_traefik_acme_email() {
  echo "" > /dev/stderr
  echo "Enter your email for Let's Encrypt certificate notifications." > /dev/stderr
@@ -297,6 +314,10 @@ initialize_default_values() {
  # NetBird Proxy configuration
  ENABLE_PROXY="false"
  PROXY_TOKEN=""
+
+  # CrowdSec configuration
+  ENABLE_CROWDSEC="false"
+  CROWDSEC_BOUNCER_KEY=""
  return 0
 }

@@ -325,6 +346,9 @@ configure_reverse_proxy() {
  if [[ "$REVERSE_PROXY_TYPE" == "0" ]]; then
    TRAEFIK_ACME_EMAIL=$(read_traefik_acme_email)
    ENABLE_PROXY=$(read_enable_proxy)
+    if [[ "$ENABLE_PROXY" == "true" ]]; then
+      ENABLE_CROWDSEC=$(read_enable_crowdsec)
+    fi
  fi

  # Handle external Traefik-specific prompts (option 1)
@@ -354,7 +378,7 @@ check_existing_installation() {
    echo "Generated files already exist, if you want to reinitialize the environment, please remove them first."
    echo "You can use the following commands:"
    echo "  $DOCKER_COMPOSE_COMMAND down --volumes # to remove all containers and volumes"
-    echo "  rm -f docker-compose.yml dashboard.env config.yaml proxy.env traefik-dynamic.yaml nginx-netbird.conf caddyfile-netbird.txt npm-advanced-config.txt"
+    echo "  rm -f docker-compose.yml dashboard.env config.yaml proxy.env traefik-dynamic.yaml nginx-netbird.conf caddyfile-netbird.txt npm-advanced-config.txt && rm -rf crowdsec/"
    echo "Be aware that this will remove all data from the database, and you will have to reconfigure the dashboard."
    exit 1
  fi
@@ -375,6 +399,9 @@ generate_configuration_files() {
        echo "NB_PROXY_TOKEN=placeholder" >> proxy.env
        # TCP ServersTransport for PROXY protocol v2 to the proxy backend
        render_traefik_dynamic > traefik-dynamic.yaml
+        if [[ "$ENABLE_CROWDSEC" == "true" ]]; then
+          mkdir -p crowdsec
+        fi
      fi
      ;;
    1)
@@ -417,8 +444,12 @@ start_services_and_show_instructions() {

    if [[ "$ENABLE_PROXY" == "true" ]]; then
      # Phase 1: Start core services (without proxy)
+      local core_services="traefik dashboard netbird-server"
+      if [[ "$ENABLE_CROWDSEC" == "true" ]]; then
+        core_services="$core_services crowdsec"
+      fi
      echo "Starting core services..."
-      $DOCKER_COMPOSE_COMMAND up -d traefik dashboard netbird-server
+      $DOCKER_COMPOSE_COMMAND up -d $core_services

      sleep 3
      wait_management_proxy traefik
@@ -438,7 +469,33 @@ start_services_and_show_instructions() {

      echo "Proxy token created successfully."

-      # Generate proxy.env with the token
+      if [[ "$ENABLE_CROWDSEC" == "true" ]]; then
+        echo "Registering CrowdSec bouncer..."
+        local cs_retries=0
+        while ! $DOCKER_COMPOSE_COMMAND exec -T crowdsec cscli capi status >/dev/null 2>&1; do
+          cs_retries=$((cs_retries + 1))
+          if [[ $cs_retries -ge 30 ]]; then
+            echo "WARNING: CrowdSec did not become ready. Skipping CrowdSec setup." > /dev/stderr
+            echo "You can register a bouncer manually later with:" > /dev/stderr
+            echo "  docker exec netbird-crowdsec cscli bouncers add netbird-proxy -o raw" > /dev/stderr
+            ENABLE_CROWDSEC="false"
+            break
+          fi
+          sleep 2
+        done
+
+        if [[ "$ENABLE_CROWDSEC" == "true" ]]; then
+          CROWDSEC_BOUNCER_KEY=$($DOCKER_COMPOSE_COMMAND exec -T crowdsec \
+            cscli bouncers add netbird-proxy -o raw 2>/dev/null)
+          if [[ -z "$CROWDSEC_BOUNCER_KEY" ]]; then
+            echo "WARNING: Failed to create CrowdSec bouncer key. Skipping CrowdSec setup." > /dev/stderr
+            ENABLE_CROWDSEC="false"
+          else
+            echo "CrowdSec bouncer registered."
+          fi
+        fi
+      fi
+
      render_proxy_env > proxy.env

      # Start proxy service
@@ -525,11 +582,25 @@ render_docker_compose_traefik_builtin() {
  # Generate proxy service section and Traefik dynamic config if enabled
  local proxy_service=""
  local proxy_volumes=""
+  local crowdsec_service=""
+  local crowdsec_volumes=""
  local traefik_file_provider=""
  local traefik_dynamic_volume=""
  if [[ "$ENABLE_PROXY" == "true" ]]; then
    traefik_file_provider='      - "--providers.file.filename=/etc/traefik/dynamic.yaml"'
    traefik_dynamic_volume="      - ./traefik-dynamic.yaml:/etc/traefik/dynamic.yaml:ro"
+
+    local proxy_depends="
+      netbird-server:
+        condition: service_started"
+    if [[ "$ENABLE_CROWDSEC" == "true" ]]; then
+      proxy_depends="
+      netbird-server:
+        condition: service_started
+      crowdsec:
+        condition: service_healthy"
+    fi
+
    proxy_service="
  # NetBird Proxy - exposes internal resources to the internet
  proxy:
@@ -539,8 +610,7 @@ render_docker_compose_traefik_builtin() {
    - 51820:51820/udp
    restart: unless-stopped
    networks: [netbird]
-    depends_on:
-      - netbird-server
+    depends_on:${proxy_depends}
    env_file:
      - ./proxy.env
    volumes:
@@ -563,6 +633,35 @@ render_docker_compose_traefik_builtin() {
 "
    proxy_volumes="
  netbird_proxy_certs:"
+
+    if [[ "$ENABLE_CROWDSEC" == "true" ]]; then
+      crowdsec_service="
+  crowdsec:
+    image: crowdsecurity/crowdsec:v1.7.7
+    container_name: netbird-crowdsec
+    restart: unless-stopped
+    networks: [netbird]
+    environment:
+      COLLECTIONS: crowdsecurity/linux
+    volumes:
+      - ./crowdsec:/etc/crowdsec
+      - crowdsec_db:/var/lib/crowdsec/data
+    healthcheck:
+      test: ["CMD", "cscli", "lapi", "status"]
+      interval: 10s
+      timeout: 5s
+      retries: 15
+    labels:
+      - traefik.enable=false
+    logging:
+      driver: \"json-file\"
+      options:
+        max-size: \"500m\"
+        max-file: \"2\"
+"
+      crowdsec_volumes="
+  crowdsec_db:"
+    fi
  fi

  cat <<EOF
@@ -675,10 +774,10 @@ $traefik_dynamic_volume
      options:
        max-size: "500m"
        max-file: "2"
-${proxy_service}
+${proxy_service}${crowdsec_service}
 volumes:
  netbird_data:
-  netbird_traefik_letsencrypt:${proxy_volumes}
+  netbird_traefik_letsencrypt:${proxy_volumes}${crowdsec_volumes}

 networks:
  netbird:
@@ -783,6 +882,14 @@ NB_PROXY_PROXY_PROTOCOL=true
 # Trust Traefik's IP for PROXY protocol headers
 NB_PROXY_TRUSTED_PROXIES=$TRAEFIK_IP
 EOF
+
+  if [[ "$ENABLE_CROWDSEC" == "true" && -n "$CROWDSEC_BOUNCER_KEY" ]]; then
+    cat <<EOF
+NB_PROXY_CROWDSEC_API_URL=http://crowdsec:8080
+NB_PROXY_CROWDSEC_API_KEY=$CROWDSEC_BOUNCER_KEY
+EOF
+  fi
+
  return 0
 }

@@ -1172,6 +1279,17 @@ print_builtin_traefik_instructions() {
    echo ""
    echo "  *.$NETBIRD_DOMAIN    CNAME    $NETBIRD_DOMAIN"
    echo ""
+    if [[ "$ENABLE_CROWDSEC" == "true" ]]; then
+      echo "CrowdSec IP Reputation:"
+      echo "  CrowdSec LAPI is running and connected to the community blocklist."
+      echo "  The proxy will automatically check client IPs against known threats."
+      echo "  Enable CrowdSec per-service in the dashboard under Access Control."
+      echo ""
+      echo "  To enroll in CrowdSec Console (optional, for dashboard and premium blocklists):"
+      echo "    docker exec netbird-crowdsec cscli console enroll <your-enrollment-key>"
+      echo "  Get your enrollment key at: https://app.crowdsec.net"
+      echo ""
+    fi
  fi
  return 0
 }
Author	SHA1	Message	Date
Viktor Liu	6f845fb84f	Update infrastructure_files/getting-started.sh Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>	2026-04-14 13:39:16 +02:00
Viktor Liu	49b24b9780	Add CrowdSec LAPI container to self-hosted setup script	2026-04-14 12:54:15 +02:00
Zoltan Papp	13539543af	[client] Fix/grpc retry (#5750 ) * [client] Fix flow client Receive retry loop not stopping after Close Use backoff.Permanent for canceled gRPC errors so Receive returns immediately instead of retrying until context deadline when the connection is already closed. Add TestNewClient_PermanentClose to verify the behavior. The connectivity.Shutdown check was meaningless because when the connection is shut down, c.realClient.Events(ctx, grpc.WaitForReady(true)) on the nex line already fails with codes.Canceled — which is now handled as a permanent error. The explicit state check was just duplicating what gRPC already reports through its normal error path. * [client] remove WaitForReady from stream open call grpc.WaitForReady(true) parks the RPC call internally until the connection reaches READY, only unblocking on ctx cancellation. This means the external backoff.Retry loop in Receive() never gets control back during a connection outage — it cannot tick, log, or apply its retry intervals while WaitForReady is blocking. Removing it restores fail-fast behaviour: Events() returns immediately with codes.Unavailable when the connection is not ready, which is exactly what the backoff loop expects. The backoff becomes the single authority over retry timing and cadence, as originally intended. * [client] Add connection recreation and improve flow client error handling Store gRPC dial options on the client to enable connection recreation on Internal errors (RST_STREAM/PROTOCOL_ERROR). Treat Unauthenticated, PermissionDenied, and Unimplemented as permanent failures. Unify mutex usage and add reconnection logging for better observability. * [client] Remove Unauthenticated, PermissionDenied, and Unimplemented from permanent error handling * [client] Fix error handling in Receive to properly re-establish stream and improve reconnection messaging * Fix test * [client] Add graceful shutdown handling and test for concurrent Close during Receive Prevent reconnection attempts after client closure by tracking a `closed` flag. Use `backoff.Permanent` for errors caused by operations on a closed client. Add a test to ensure `Close` does not block when `Receive` is actively running. * [client] Fix connection swap to properly close old gRPC connection Close the old `gRPC.ClientConn` after successfully swapping to a new connection during reconnection. * [client] Reset backoff * [client] Ensure stream closure on error during initialization * [client] Add test for handling server-side stream closure and reconnection Introduce `TestReceive_ServerClosesStream` to verify the client's ability to recover and process acknowledgments after the server closes the stream. Enhance test server with a controlled stream closure mechanism. * [client] Add protocol error simulation and enhance reconnection test Introduce `connTrackListener` to simulate HTTP/2 RST_STREAM with PROTOCOL_ERROR for testing. Refactor and rename `TestReceive_ServerClosesStream` to `TestReceive_ProtocolErrorStreamReconnect` to verify client recovery on protocol errors. * [client] Update Close error message in test for clarity * [client] Fine-tune the tests * [client] Adjust connection tracking in reconnection test * [client] Wait for Events handler to exit in RST_STREAM reconnection test Ensure the old `Events` handler exits fully before proceeding in the reconnection test to avoid dropped acknowledgments on a broken stream. Add a `handlerDone` channel to synchronize handler exits. * [client] Prevent panic on nil connection during Close * [client] Refactor connection handling to use explicit target tracking Introduce `target` field to store the gRPC connection target directly, simplifying reconnections and ensuring consistent connection reuse logic. * [client] Rename `isCancellation` to `isContextDone` and extend handling for `DeadlineExceeded` Refactor error handling to include `DeadlineExceeded` scenarios alongside `Canceled`. Update related condition checks for consistency. * [client] Add connection generation tracking to prevent stale reconnections Introduce `connGen` to track connection generations and ensure that stale `recreateConnection` calls do not override newer connections. Update stream establishment and reconnection logic to incorporate generation validation. * [client] Add backoff reset condition to prevent short-lived retry cycles Refine backoff reset logic to ensure it only occurs for sufficiently long-lived stream connections, avoiding interference with `MaxElapsedTime`. * [client] Introduce `minHealthyDuration` to refine backoff reset logic Add `minHealthyDuration` constant to ensure stream retries only reset the backoff timer if the stream survives beyond a minimum duration. Prevents unhealthy, short-lived streams from interfering with `MaxElapsedTime`. * [client] IPv6 friendly connection parsedURL.Hostname() strips IPv6 brackets. For http://[::1]:443, this turns it into ::1:443, which is not a valid host:port target for gRPC. Additionally, fmt.Sprintf("%s:%s", hostname, port) produces a trailing colon when the URL has no explicit port—http://example.com becomes example.com:. Both cases break the initial dial and reconnect paths. Use parsedURL.Host directly instead. * [client] Add `handlerStarted` channel to synchronize stream establishment in tests Introduce `handlerStarted` channel in the test server to signal when the server-side handler begins, ensuring robust synchronization between client and server during stream establishment. Update relevant test cases to wait for this signal before proceeding. * [client] Replace `receivedAcks` map with atomic counter and improve stream establishment sync in tests Refactor acknowledgment tracking in tests to use an `atomic.Int32` counter instead of a map. Replace fixed sleep with robust synchronization by waiting on `handlerStarted` signal for stream establishment. * [client] Extract `handleReceiveError` to simplify receive logic Refactor error handling in `receive` to a dedicated `handleReceiveError` method. Streamlines the main logic and isolates error recovery, including backoff reset and connection recreation. * [client] recreate gRPC ClientConn on every retry to prevent dual backoff The flow client had two competing retry loops: our custom exponential backoff and gRPC's internal subchannel reconnection. When establishStream failed, the same ClientConn was reused, allowing gRPC's internal backoff state to accumulate and control dial timing independently. Changes: - Consolidate error handling into handleRetryableError, which now handles context cancellation, permanent errors, backoff reset, and connection recreation in a single path - Call recreateConnection on every retryable error so each retry gets a fresh ClientConn with no internal backoff state - Remove connGen tracking since Receive is sequential and protected by a new receiving guard against concurrent calls - Reduce RandomizationFactor from 1 to 0.5 to avoid near-zero backoff intervals	2026-04-13 10:42:24 +02:00
Zoltan Papp	7483fec048	Fix Android internet blackhole caused by stale route re-injection on TUN rebuild (#5865 ) extraInitialRoutes() was meant to preserve only the fake IP route (240.0.0.0/8) across TUN rebuilds, but it re-injected any initial route missing from the current set. When the management server advertised exit node routes (0.0.0.0/0) that were later filtered by the route selector, extraInitialRoutes() re-added them, causing the Android VPN to capture all traffic with no peer to handle it. Store the fake IP route explicitly and append only that in notify(), removing the overly broad initial route diffing.	2026-04-13 09:38:38 +02:00