Add health-check agent recognition to avoid error logs (#4917)

Health-check connections now send a properly formatted auth message
with a well-known peer ID instead of immediately closing. The server
recognizes this peer ID and handles the connection gracefully with a
debug log instead of error logs.
This commit is contained in:
Zoltan Papp
2025-12-15 10:28:25 +01:00
committed by GitHub
parent 08f31fbcb3
commit 5748bdd64e
4 changed files with 53 additions and 4 deletions

View File

@@ -0,0 +1,31 @@
// Package peerid holds the well-known peer ID and placeholder auth token
// that relay health-check connections present, plus a helper for
// recognizing that peer ID.
package peerid
import (
"crypto/sha256"
v2 "github.com/netbirdio/netbird/shared/relay/auth/hmac/v2"
"github.com/netbirdio/netbird/shared/relay/messages"
)
var (
// HealthCheckPeerID is the peer ID that health-check connections identify
// themselves with. It is the result of hashing the fixed name
// "healthcheck-agent" with messages.HashID.
HealthCheckPeerID = messages.HashID("healthcheck-agent")
// DummyAuthToken is a structurally valid auth token for health checks,
// built once at package initialization by createDummyToken.
// Its signature is all zero bytes, so the signature is not valid, but the
// format is correct (1 byte algo + 32 bytes signature + payload).
// NOTE(review): the byte layout is determined by v2.Token.Marshal, which is
// not visible here; confirm against that implementation.
DummyAuthToken = createDummyToken()
)
// createDummyToken returns the marshalled form of a placeholder v2 token.
// The token declares the HMAC-SHA256 algorithm, carries a zero-filled
// signature of sha256.Size bytes, and uses "healthcheck" as its payload.
func createDummyToken() []byte {
	var placeholder v2.Token

	placeholder.AuthAlgo = v2.AuthAlgoHMACSHA256
	// A freshly made slice is all zeros: correct length, meaningless content.
	placeholder.Signature = make([]byte, sha256.Size)
	placeholder.Payload = []byte("healthcheck")

	return placeholder.Marshal()
}
// IsHealthCheck reports whether peerID is non-nil and equal to
// HealthCheckPeerID. A nil peerID is never treated as a health check.
func IsHealthCheck(peerID *messages.PeerID) bool {
	if peerID == nil {
		return false
	}
	return HealthCheckPeerID == *peerID
}

View File

@@ -7,8 +7,10 @@ import (
"github.com/coder/websocket"
"github.com/netbirdio/netbird/relay/healthcheck/peerid"
"github.com/netbirdio/netbird/relay/server"
"github.com/netbirdio/netbird/shared/relay"
"github.com/netbirdio/netbird/shared/relay/messages"
)
func dialWS(ctx context.Context, address url.URL) error {
@@ -30,7 +32,18 @@ func dialWS(ctx context.Context, address url.URL) error {
if err != nil {
return fmt.Errorf("failed to connect to websocket: %w", err)
}
defer func() {
_ = conn.CloseNow()
}()
authMsg, err := messages.MarshalAuthMsg(peerid.HealthCheckPeerID, peerid.DummyAuthToken)
if err != nil {
return fmt.Errorf("failed to marshal auth message: %w", err)
}
if err := conn.Write(ctx, websocket.MessageBinary, authMsg); err != nil {
return fmt.Errorf("failed to write auth message: %w", err)
}
_ = conn.Close(websocket.StatusNormalClosure, "availability check complete")
return nil
}

View File

@@ -97,7 +97,7 @@ func (h *handshake) handshakeReceive() (*messages.PeerID, error) {
return nil, fmt.Errorf("invalid message type %d from %s", msgType, h.conn.RemoteAddr())
}
if err != nil {
return nil, err
return peerID, err
}
h.peerID = peerID
return peerID, nil
@@ -147,7 +147,7 @@ func (h *handshake) handleAuthMsg(buf []byte) (*messages.PeerID, error) {
}
if err := h.validator.Validate(authPayload); err != nil {
return nil, fmt.Errorf("validate %s (%s): %w", rawPeerID.String(), h.conn.RemoteAddr(), err)
return rawPeerID, fmt.Errorf("validate %s (%s): %w", rawPeerID.String(), h.conn.RemoteAddr(), err)
}
return rawPeerID, nil

View File

@@ -12,6 +12,7 @@ import (
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/metric"
"github.com/netbirdio/netbird/relay/healthcheck/peerid"
//nolint:staticcheck
"github.com/netbirdio/netbird/relay/metrics"
"github.com/netbirdio/netbird/relay/server/store"
@@ -123,7 +124,11 @@ func (r *Relay) Accept(conn net.Conn) {
}
peerID, err := h.handshakeReceive()
if err != nil {
log.Errorf("failed to handshake: %s", err)
if peerid.IsHealthCheck(peerID) {
log.Debugf("health check connection from %s", conn.RemoteAddr())
} else {
log.Errorf("failed to handshake: %s", err)
}
if cErr := conn.Close(); cErr != nil {
log.Errorf("failed to close connection, %s: %s", conn.RemoteAddr(), cErr)
}