mirror of
https://github.com/netbirdio/netbird.git
synced 2026-03-31 14:44:34 -04:00
Compare commits
3 Commits
move-licen
...
dns-exit-n
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2db23a42dc | ||
|
|
c2822eebb0 | ||
|
|
5b246e0a08 |
@@ -388,12 +388,12 @@ func generateDebugBundle(config *profilemanager.Config, recorder *peer.Status, c
|
||||
}
|
||||
|
||||
func init() {
|
||||
debugBundleCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 1, "Number of rotated log files to include in debug bundle")
|
||||
debugBundleCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 10, "Number of rotated log files to include in debug bundle")
|
||||
debugBundleCmd.Flags().BoolVarP(&systemInfoFlag, "system-info", "S", true, "Adds system information to the debug bundle")
|
||||
debugBundleCmd.Flags().BoolVarP(&uploadBundleFlag, "upload-bundle", "U", false, "Uploads the debug bundle to a server")
|
||||
debugBundleCmd.Flags().StringVar(&uploadBundleURLFlag, "upload-bundle-url", types.DefaultBundleURL, "Service URL to get an URL to upload the debug bundle")
|
||||
|
||||
forCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 1, "Number of rotated log files to include in debug bundle")
|
||||
forCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 10, "Number of rotated log files to include in debug bundle")
|
||||
forCmd.Flags().BoolVarP(&systemInfoFlag, "system-info", "S", true, "Adds system information to the debug bundle")
|
||||
forCmd.Flags().BoolVarP(&uploadBundleFlag, "upload-bundle", "U", false, "Uploads the debug bundle to a server")
|
||||
forCmd.Flags().StringVar(&uploadBundleURLFlag, "upload-bundle-url", types.DefaultBundleURL, "Service URL to get an URL to upload the debug bundle")
|
||||
|
||||
@@ -315,6 +315,10 @@ func (g *BundleGenerator) createArchive() error {
|
||||
return fmt.Errorf("add sync response: %w", err)
|
||||
}
|
||||
|
||||
if err := g.addDNSConfig(); err != nil {
|
||||
log.Errorf("failed to add DNS config to debug bundle: %v", err)
|
||||
}
|
||||
|
||||
if err := g.addStateFile(); err != nil {
|
||||
log.Errorf("failed to add state file to debug bundle: %v", err)
|
||||
}
|
||||
@@ -341,6 +345,50 @@ func (g *BundleGenerator) createArchive() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// addDNSConfig writes a dns_config.json snapshot with routed domains and NS group status
|
||||
func (g *BundleGenerator) addDNSConfig() error {
|
||||
type nsGroup struct {
|
||||
ID string `json:"id"`
|
||||
Servers []string `json:"servers"`
|
||||
Domains []string `json:"domains"`
|
||||
Enabled bool `json:"enabled"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
type dnsConfig struct {
|
||||
Groups []nsGroup `json:"name_server_groups"`
|
||||
}
|
||||
|
||||
if g.statusRecorder == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
states := g.statusRecorder.GetDNSStates()
|
||||
cfg := dnsConfig{Groups: make([]nsGroup, 0, len(states))}
|
||||
for _, st := range states {
|
||||
var servers []string
|
||||
for _, ap := range st.Servers {
|
||||
servers = append(servers, ap.String())
|
||||
}
|
||||
var errStr string
|
||||
if st.Error != nil {
|
||||
errStr = st.Error.Error()
|
||||
}
|
||||
cfg.Groups = append(cfg.Groups, nsGroup{
|
||||
ID: st.ID,
|
||||
Servers: servers,
|
||||
Domains: st.Domains,
|
||||
Enabled: st.Enabled,
|
||||
Error: errStr,
|
||||
})
|
||||
}
|
||||
|
||||
bs, err := json.MarshalIndent(cfg, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal dns config: %w", err)
|
||||
}
|
||||
return g.addFileToZip(bytes.NewReader(bs), "dns_config.json")
|
||||
}
|
||||
|
||||
func (g *BundleGenerator) addSystemInfo() {
|
||||
if err := g.addRoutes(); err != nil {
|
||||
log.Errorf("failed to add routes to debug bundle: %v", err)
|
||||
|
||||
@@ -46,6 +46,18 @@ type DNSForwarder struct {
|
||||
fwdEntries []*ForwarderEntry
|
||||
firewall firewaller
|
||||
resolver resolver
|
||||
|
||||
// failure rate tracking for routed domains
|
||||
failureMu sync.Mutex
|
||||
failureCounts map[string]int
|
||||
failureWindow time.Duration
|
||||
lastLogPerHost map[string]time.Time
|
||||
|
||||
// per-domain rolling stats and windows
|
||||
statsMu sync.Mutex
|
||||
stats map[string]*domainStats
|
||||
winSize time.Duration
|
||||
slowT time.Duration
|
||||
}
|
||||
|
||||
func NewDNSForwarder(listenAddress string, ttl uint32, firewall firewaller, statusRecorder *peer.Status) *DNSForwarder {
|
||||
@@ -56,9 +68,25 @@ func NewDNSForwarder(listenAddress string, ttl uint32, firewall firewaller, stat
|
||||
firewall: firewall,
|
||||
statusRecorder: statusRecorder,
|
||||
resolver: net.DefaultResolver,
|
||||
failureCounts: make(map[string]int),
|
||||
failureWindow: 10 * time.Second,
|
||||
lastLogPerHost: make(map[string]time.Time),
|
||||
stats: make(map[string]*domainStats),
|
||||
winSize: 10 * time.Second,
|
||||
slowT: 300 * time.Millisecond,
|
||||
}
|
||||
}
|
||||
|
||||
type domainStats struct {
|
||||
total int
|
||||
success int
|
||||
timeouts int
|
||||
notfound int
|
||||
failures int // other failures (incl. SERVFAIL-like)
|
||||
slow int
|
||||
lastLog time.Time
|
||||
}
|
||||
|
||||
func (f *DNSForwarder) Listen(entries []*ForwarderEntry) error {
|
||||
log.Infof("starting DNS forwarder on address=%s", f.listenAddress)
|
||||
|
||||
@@ -163,12 +191,19 @@ func (f *DNSForwarder) handleDNSQuery(w dns.ResponseWriter, query *dns.Msg) *dns
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), upstreamTimeout)
|
||||
defer cancel()
|
||||
start := time.Now()
|
||||
ips, err := f.resolver.LookupNetIP(ctx, network, domain)
|
||||
elapsed := time.Since(start)
|
||||
if err != nil {
|
||||
f.handleDNSError(ctx, w, question, resp, domain, err)
|
||||
// record error stats for routed domains
|
||||
f.recordErrorStats(strings.TrimSuffix(domain, "."), err)
|
||||
return nil
|
||||
}
|
||||
|
||||
// record success timing
|
||||
f.recordSuccessStats(strings.TrimSuffix(domain, "."), elapsed)
|
||||
|
||||
f.updateInternalState(ips, mostSpecificResId, matchingEntries)
|
||||
f.addIPsToResponse(resp, domain, ips)
|
||||
|
||||
@@ -306,6 +341,91 @@ func (f *DNSForwarder) handleDNSError(ctx context.Context, w dns.ResponseWriter,
|
||||
if err := w.WriteMsg(resp); err != nil {
|
||||
log.Errorf("failed to write failure DNS response: %v", err)
|
||||
}
|
||||
|
||||
// Track failure rate for routed domains only
|
||||
if resID, _ := f.getMatchingEntries(strings.TrimSuffix(domain, ".")); resID != "" {
|
||||
f.recordDomainFailure(strings.TrimSuffix(domain, "."))
|
||||
}
|
||||
}
|
||||
|
||||
// recordErrorStats updates per-domain counters and emits rate-limited logs
|
||||
func (f *DNSForwarder) recordErrorStats(domain string, err error) {
|
||||
domain = strings.ToLower(domain)
|
||||
f.statsMu.Lock()
|
||||
s := f.ensureStats(domain)
|
||||
s.total++
|
||||
|
||||
var dnsErr *net.DNSError
|
||||
if errors.As(err, &dnsErr) {
|
||||
if dnsErr.IsNotFound {
|
||||
s.notfound++
|
||||
} else if dnsErr.Timeout() {
|
||||
s.timeouts++
|
||||
} else {
|
||||
s.failures++
|
||||
}
|
||||
} else {
|
||||
s.failures++
|
||||
}
|
||||
|
||||
f.maybeLogDomainStats(domain, s)
|
||||
f.statsMu.Unlock()
|
||||
}
|
||||
|
||||
// recordSuccessStats updates per-domain latency stats and slow counters, logs if needed (rate-limited)
|
||||
func (f *DNSForwarder) recordSuccessStats(domain string, elapsed time.Duration) {
|
||||
domain = strings.ToLower(domain)
|
||||
f.statsMu.Lock()
|
||||
s := f.ensureStats(domain)
|
||||
s.total++
|
||||
s.success++
|
||||
if elapsed >= f.slowT {
|
||||
s.slow++
|
||||
}
|
||||
f.maybeLogDomainStats(domain, s)
|
||||
f.statsMu.Unlock()
|
||||
}
|
||||
|
||||
func (f *DNSForwarder) ensureStats(domain string) *domainStats {
|
||||
if ds, ok := f.stats[domain]; ok {
|
||||
return ds
|
||||
}
|
||||
ds := &domainStats{}
|
||||
f.stats[domain] = ds
|
||||
return ds
|
||||
}
|
||||
|
||||
// maybeLogDomainStats logs a compact summary per routed domain at most once per window
|
||||
func (f *DNSForwarder) maybeLogDomainStats(domain string, s *domainStats) {
|
||||
now := time.Now()
|
||||
if !s.lastLog.IsZero() && now.Sub(s.lastLog) < f.winSize {
|
||||
return
|
||||
}
|
||||
|
||||
// check if routed (avoid logging for non-routed domains)
|
||||
if resID, _ := f.getMatchingEntries(domain); resID == "" {
|
||||
return
|
||||
}
|
||||
|
||||
// only log if something noteworthy happened in the window
|
||||
noteworthy := s.timeouts > 0 || s.notfound > 0 || s.failures > 0 || s.slow > 0
|
||||
if !noteworthy {
|
||||
s.lastLog = now
|
||||
return
|
||||
}
|
||||
|
||||
// warn on persistent problems, info otherwise
|
||||
levelWarn := s.timeouts >= 3 || s.failures >= 3
|
||||
if levelWarn {
|
||||
log.Warnf("[d] DNS stats: domain=%s total=%d ok=%d timeout=%d nxdomain=%d fail=%d slow=%d(>=%s)",
|
||||
domain, s.total, s.success, s.timeouts, s.notfound, s.failures, s.slow, f.slowT)
|
||||
} else {
|
||||
log.Infof("[d] DNS stats: domain=%s total=%d ok=%d timeout=%d nxdomain=%d fail=%d slow=%d(>=%s)",
|
||||
domain, s.total, s.success, s.timeouts, s.notfound, s.failures, s.slow, f.slowT)
|
||||
}
|
||||
|
||||
// reset counters for next window
|
||||
*s = domainStats{lastLog: now}
|
||||
}
|
||||
|
||||
// addIPsToResponse adds IP addresses to the DNS response as appropriate A or AAAA records
|
||||
@@ -341,6 +461,27 @@ func (f *DNSForwarder) addIPsToResponse(resp *dns.Msg, domain string, ips []neti
|
||||
}
|
||||
}
|
||||
|
||||
// recordDomainFailure increments failure count for the domain and logs at info/warn with throttling.
|
||||
func (f *DNSForwarder) recordDomainFailure(domain string) {
|
||||
domain = strings.ToLower(domain)
|
||||
|
||||
f.failureMu.Lock()
|
||||
defer f.failureMu.Unlock()
|
||||
|
||||
f.failureCounts[domain]++
|
||||
count := f.failureCounts[domain]
|
||||
|
||||
now := time.Now()
|
||||
last, ok := f.lastLogPerHost[domain]
|
||||
if ok && now.Sub(last) < f.failureWindow {
|
||||
return
|
||||
}
|
||||
f.lastLogPerHost[domain] = now
|
||||
|
||||
log.Warnf("[d] DNS failures observed for routed domain: domain=%s failures=%d/%s", domain, count, f.failureWindow)
|
||||
|
||||
}
|
||||
|
||||
// getMatchingEntries retrieves the resource IDs for a given domain.
|
||||
// It returns the most specific match and all matching resource IDs.
|
||||
func (f *DNSForwarder) getMatchingEntries(domain string) (route.ResID, []*ForwarderEntry) {
|
||||
|
||||
@@ -21,9 +21,9 @@ import (
|
||||
"github.com/netbirdio/netbird/client/internal/ingressgw"
|
||||
"github.com/netbirdio/netbird/client/internal/relay"
|
||||
"github.com/netbirdio/netbird/client/proto"
|
||||
"github.com/netbirdio/netbird/route"
|
||||
"github.com/netbirdio/netbird/shared/management/domain"
|
||||
relayClient "github.com/netbirdio/netbird/shared/relay/client"
|
||||
"github.com/netbirdio/netbird/route"
|
||||
)
|
||||
|
||||
const eventQueueSize = 10
|
||||
@@ -201,6 +201,8 @@ type Status struct {
|
||||
resolvedDomainsStates map[domain.Domain]ResolvedDomainInfo
|
||||
lazyConnectionEnabled bool
|
||||
|
||||
lastDisconnectLog map[string]time.Time
|
||||
|
||||
// To reduce the number of notification invocation this bool will be true when need to call the notification
|
||||
// Some Peer actions mostly used by in a batch when the network map has been synchronized. In these type of events
|
||||
// set to true this variable and at the end of the processing we will reset it by the FinishPeerListModifications()
|
||||
@@ -229,6 +231,7 @@ func NewRecorder(mgmAddress string) *Status {
|
||||
notifier: newNotifier(),
|
||||
mgmAddress: mgmAddress,
|
||||
resolvedDomainsStates: map[domain.Domain]ResolvedDomainInfo{},
|
||||
lastDisconnectLog: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -487,6 +490,9 @@ func (d *Status) UpdatePeerRelayedStateToDisconnected(receivedState State) error
|
||||
|
||||
d.peers[receivedState.PubKey] = peerState
|
||||
|
||||
// info log about disconnect with impacted routes (throttled)
|
||||
d.logPeerDisconnectIfNeeded(receivedState.PubKey, peerState)
|
||||
|
||||
if hasConnStatusChanged(oldState, receivedState.ConnStatus) {
|
||||
d.notifyPeerListChanged()
|
||||
}
|
||||
@@ -519,6 +525,9 @@ func (d *Status) UpdatePeerICEStateToDisconnected(receivedState State) error {
|
||||
|
||||
d.peers[receivedState.PubKey] = peerState
|
||||
|
||||
// info log about disconnect with impacted routes (throttled)
|
||||
d.logPeerDisconnectIfNeeded(receivedState.PubKey, peerState)
|
||||
|
||||
if hasConnStatusChanged(oldState, receivedState.ConnStatus) {
|
||||
d.notifyPeerListChanged()
|
||||
}
|
||||
@@ -529,6 +538,49 @@ func (d *Status) UpdatePeerICEStateToDisconnected(receivedState State) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// logPeerDisconnectIfNeeded logs an info message when a routing peer transitions to disconnected
|
||||
// with the number of impacted routes. Throttled to once per peer per 30 seconds.
|
||||
func (d *Status) logPeerDisconnectIfNeeded(pubKey string, state State) {
|
||||
if state.ConnStatus != StatusIdle {
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
last, ok := d.lastDisconnectLog[pubKey]
|
||||
if ok && now.Sub(last) < 10*time.Second {
|
||||
return
|
||||
}
|
||||
d.lastDisconnectLog[pubKey] = now
|
||||
|
||||
routes := state.GetRoutes()
|
||||
numRoutes := len(routes)
|
||||
|
||||
fqdn := state.FQDN
|
||||
if fqdn == "" {
|
||||
fqdn = pubKey
|
||||
}
|
||||
|
||||
// prepare a bounded list of impacted routes to avoid huge log lines
|
||||
maxList := 20
|
||||
list := make([]string, 0, maxList)
|
||||
for r := range routes {
|
||||
if len(list) >= maxList {
|
||||
break
|
||||
}
|
||||
list = append(list, r)
|
||||
}
|
||||
more := ""
|
||||
if numRoutes > len(list) {
|
||||
more = ", more=" + fmt.Sprintf("%d", numRoutes-len(list))
|
||||
}
|
||||
|
||||
if len(list) > 0 {
|
||||
log.Warnf("[d] Routing peer disconnected: peer=%s impacted_routes=%d routes=%v%s", fqdn, numRoutes, list, more)
|
||||
} else {
|
||||
log.Warnf("[d] Routing peer disconnected: peer=%s impacted_routes=%d", fqdn, numRoutes)
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateWireGuardPeerState updates the WireGuard bits of the peer state
|
||||
func (d *Status) UpdateWireGuardPeerState(pubKey string, wgStats configurer.WGStats) error {
|
||||
d.mux.Lock()
|
||||
|
||||
@@ -14,7 +14,7 @@ import (
|
||||
"github.com/netbirdio/netbird/formatter"
|
||||
)
|
||||
|
||||
const defaultLogSize = 15
|
||||
const defaultLogSize = 100
|
||||
|
||||
const (
|
||||
LogConsole = "console"
|
||||
|
||||
Reference in New Issue
Block a user