Compare commits

...

3 Commits

Author SHA1 Message Date
Hakan Sariman
2db23a42dc Add DNS configuration snapshot and per-domain statistics tracking 2025-09-09 17:09:13 +07:00
Hakan Sariman
c2822eebb0 [client] Enhance logging for peer disconnection events 2025-09-09 15:02:16 +07:00
Hakan Sariman
5b246e0a08 debug dns 2025-09-09 14:48:39 +07:00
5 changed files with 245 additions and 4 deletions

View File

@@ -388,12 +388,12 @@ func generateDebugBundle(config *profilemanager.Config, recorder *peer.Status, c
}
func init() {
debugBundleCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 1, "Number of rotated log files to include in debug bundle")
debugBundleCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 10, "Number of rotated log files to include in debug bundle")
debugBundleCmd.Flags().BoolVarP(&systemInfoFlag, "system-info", "S", true, "Adds system information to the debug bundle")
debugBundleCmd.Flags().BoolVarP(&uploadBundleFlag, "upload-bundle", "U", false, "Uploads the debug bundle to a server")
debugBundleCmd.Flags().StringVar(&uploadBundleURLFlag, "upload-bundle-url", types.DefaultBundleURL, "Service URL to get an URL to upload the debug bundle")
forCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 1, "Number of rotated log files to include in debug bundle")
forCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 10, "Number of rotated log files to include in debug bundle")
forCmd.Flags().BoolVarP(&systemInfoFlag, "system-info", "S", true, "Adds system information to the debug bundle")
forCmd.Flags().BoolVarP(&uploadBundleFlag, "upload-bundle", "U", false, "Uploads the debug bundle to a server")
forCmd.Flags().StringVar(&uploadBundleURLFlag, "upload-bundle-url", types.DefaultBundleURL, "Service URL to get an URL to upload the debug bundle")

View File

@@ -315,6 +315,10 @@ func (g *BundleGenerator) createArchive() error {
return fmt.Errorf("add sync response: %w", err)
}
if err := g.addDNSConfig(); err != nil {
log.Errorf("failed to add DNS config to debug bundle: %v", err)
}
if err := g.addStateFile(); err != nil {
log.Errorf("failed to add state file to debug bundle: %v", err)
}
@@ -341,6 +345,50 @@ func (g *BundleGenerator) createArchive() error {
return nil
}
// addDNSConfig writes a dns_config.json snapshot with routed domains and NS group status
func (g *BundleGenerator) addDNSConfig() error {
type nsGroup struct {
ID string `json:"id"`
Servers []string `json:"servers"`
Domains []string `json:"domains"`
Enabled bool `json:"enabled"`
Error string `json:"error,omitempty"`
}
type dnsConfig struct {
Groups []nsGroup `json:"name_server_groups"`
}
if g.statusRecorder == nil {
return nil
}
states := g.statusRecorder.GetDNSStates()
cfg := dnsConfig{Groups: make([]nsGroup, 0, len(states))}
for _, st := range states {
var servers []string
for _, ap := range st.Servers {
servers = append(servers, ap.String())
}
var errStr string
if st.Error != nil {
errStr = st.Error.Error()
}
cfg.Groups = append(cfg.Groups, nsGroup{
ID: st.ID,
Servers: servers,
Domains: st.Domains,
Enabled: st.Enabled,
Error: errStr,
})
}
bs, err := json.MarshalIndent(cfg, "", " ")
if err != nil {
return fmt.Errorf("marshal dns config: %w", err)
}
return g.addFileToZip(bytes.NewReader(bs), "dns_config.json")
}
func (g *BundleGenerator) addSystemInfo() {
if err := g.addRoutes(); err != nil {
log.Errorf("failed to add routes to debug bundle: %v", err)

View File

@@ -46,6 +46,18 @@ type DNSForwarder struct {
fwdEntries []*ForwarderEntry
firewall firewaller
resolver resolver
// failure rate tracking for routed domains
failureMu sync.Mutex
failureCounts map[string]int
failureWindow time.Duration
lastLogPerHost map[string]time.Time
// per-domain rolling stats and windows
statsMu sync.Mutex
stats map[string]*domainStats
winSize time.Duration
slowT time.Duration
}
func NewDNSForwarder(listenAddress string, ttl uint32, firewall firewaller, statusRecorder *peer.Status) *DNSForwarder {
@@ -56,9 +68,25 @@ func NewDNSForwarder(listenAddress string, ttl uint32, firewall firewaller, stat
firewall: firewall,
statusRecorder: statusRecorder,
resolver: net.DefaultResolver,
failureCounts: make(map[string]int),
failureWindow: 10 * time.Second,
lastLogPerHost: make(map[string]time.Time),
stats: make(map[string]*domainStats),
winSize: 10 * time.Second,
slowT: 300 * time.Millisecond,
}
}
type domainStats struct {
total int
success int
timeouts int
notfound int
failures int // other failures (incl. SERVFAIL-like)
slow int
lastLog time.Time
}
func (f *DNSForwarder) Listen(entries []*ForwarderEntry) error {
log.Infof("starting DNS forwarder on address=%s", f.listenAddress)
@@ -163,12 +191,19 @@ func (f *DNSForwarder) handleDNSQuery(w dns.ResponseWriter, query *dns.Msg) *dns
ctx, cancel := context.WithTimeout(context.Background(), upstreamTimeout)
defer cancel()
start := time.Now()
ips, err := f.resolver.LookupNetIP(ctx, network, domain)
elapsed := time.Since(start)
if err != nil {
f.handleDNSError(ctx, w, question, resp, domain, err)
// record error stats for routed domains
f.recordErrorStats(strings.TrimSuffix(domain, "."), err)
return nil
}
// record success timing
f.recordSuccessStats(strings.TrimSuffix(domain, "."), elapsed)
f.updateInternalState(ips, mostSpecificResId, matchingEntries)
f.addIPsToResponse(resp, domain, ips)
@@ -306,6 +341,91 @@ func (f *DNSForwarder) handleDNSError(ctx context.Context, w dns.ResponseWriter,
if err := w.WriteMsg(resp); err != nil {
log.Errorf("failed to write failure DNS response: %v", err)
}
// Track failure rate for routed domains only
if resID, _ := f.getMatchingEntries(strings.TrimSuffix(domain, ".")); resID != "" {
f.recordDomainFailure(strings.TrimSuffix(domain, "."))
}
}
// recordErrorStats updates per-domain counters and emits rate-limited logs
func (f *DNSForwarder) recordErrorStats(domain string, err error) {
domain = strings.ToLower(domain)
f.statsMu.Lock()
s := f.ensureStats(domain)
s.total++
var dnsErr *net.DNSError
if errors.As(err, &dnsErr) {
if dnsErr.IsNotFound {
s.notfound++
} else if dnsErr.Timeout() {
s.timeouts++
} else {
s.failures++
}
} else {
s.failures++
}
f.maybeLogDomainStats(domain, s)
f.statsMu.Unlock()
}
// recordSuccessStats updates per-domain latency stats and slow counters, logs if needed (rate-limited)
func (f *DNSForwarder) recordSuccessStats(domain string, elapsed time.Duration) {
domain = strings.ToLower(domain)
f.statsMu.Lock()
s := f.ensureStats(domain)
s.total++
s.success++
if elapsed >= f.slowT {
s.slow++
}
f.maybeLogDomainStats(domain, s)
f.statsMu.Unlock()
}
func (f *DNSForwarder) ensureStats(domain string) *domainStats {
if ds, ok := f.stats[domain]; ok {
return ds
}
ds := &domainStats{}
f.stats[domain] = ds
return ds
}
// maybeLogDomainStats logs a compact summary per routed domain at most once per window
func (f *DNSForwarder) maybeLogDomainStats(domain string, s *domainStats) {
now := time.Now()
if !s.lastLog.IsZero() && now.Sub(s.lastLog) < f.winSize {
return
}
// check if routed (avoid logging for non-routed domains)
if resID, _ := f.getMatchingEntries(domain); resID == "" {
return
}
// only log if something noteworthy happened in the window
noteworthy := s.timeouts > 0 || s.notfound > 0 || s.failures > 0 || s.slow > 0
if !noteworthy {
s.lastLog = now
return
}
// warn on persistent problems, info otherwise
levelWarn := s.timeouts >= 3 || s.failures >= 3
if levelWarn {
log.Warnf("[d] DNS stats: domain=%s total=%d ok=%d timeout=%d nxdomain=%d fail=%d slow=%d(>=%s)",
domain, s.total, s.success, s.timeouts, s.notfound, s.failures, s.slow, f.slowT)
} else {
log.Infof("[d] DNS stats: domain=%s total=%d ok=%d timeout=%d nxdomain=%d fail=%d slow=%d(>=%s)",
domain, s.total, s.success, s.timeouts, s.notfound, s.failures, s.slow, f.slowT)
}
// reset counters for next window
*s = domainStats{lastLog: now}
}
// addIPsToResponse adds IP addresses to the DNS response as appropriate A or AAAA records
@@ -341,6 +461,27 @@ func (f *DNSForwarder) addIPsToResponse(resp *dns.Msg, domain string, ips []neti
}
}
// recordDomainFailure increments failure count for the domain and logs at info/warn with throttling.
func (f *DNSForwarder) recordDomainFailure(domain string) {
domain = strings.ToLower(domain)
f.failureMu.Lock()
defer f.failureMu.Unlock()
f.failureCounts[domain]++
count := f.failureCounts[domain]
now := time.Now()
last, ok := f.lastLogPerHost[domain]
if ok && now.Sub(last) < f.failureWindow {
return
}
f.lastLogPerHost[domain] = now
log.Warnf("[d] DNS failures observed for routed domain: domain=%s failures=%d/%s", domain, count, f.failureWindow)
}
// getMatchingEntries retrieves the resource IDs for a given domain.
// It returns the most specific match and all matching resource IDs.
func (f *DNSForwarder) getMatchingEntries(domain string) (route.ResID, []*ForwarderEntry) {

View File

@@ -21,9 +21,9 @@ import (
"github.com/netbirdio/netbird/client/internal/ingressgw"
"github.com/netbirdio/netbird/client/internal/relay"
"github.com/netbirdio/netbird/client/proto"
"github.com/netbirdio/netbird/route"
"github.com/netbirdio/netbird/shared/management/domain"
relayClient "github.com/netbirdio/netbird/shared/relay/client"
"github.com/netbirdio/netbird/route"
)
const eventQueueSize = 10
@@ -201,6 +201,8 @@ type Status struct {
resolvedDomainsStates map[domain.Domain]ResolvedDomainInfo
lazyConnectionEnabled bool
lastDisconnectLog map[string]time.Time
// To reduce the number of notification invocation this bool will be true when need to call the notification
// Some Peer actions mostly used by in a batch when the network map has been synchronized. In these type of events
// set to true this variable and at the end of the processing we will reset it by the FinishPeerListModifications()
@@ -229,6 +231,7 @@ func NewRecorder(mgmAddress string) *Status {
notifier: newNotifier(),
mgmAddress: mgmAddress,
resolvedDomainsStates: map[domain.Domain]ResolvedDomainInfo{},
lastDisconnectLog: make(map[string]time.Time),
}
}
@@ -487,6 +490,9 @@ func (d *Status) UpdatePeerRelayedStateToDisconnected(receivedState State) error
d.peers[receivedState.PubKey] = peerState
// info log about disconnect with impacted routes (throttled)
d.logPeerDisconnectIfNeeded(receivedState.PubKey, peerState)
if hasConnStatusChanged(oldState, receivedState.ConnStatus) {
d.notifyPeerListChanged()
}
@@ -519,6 +525,9 @@ func (d *Status) UpdatePeerICEStateToDisconnected(receivedState State) error {
d.peers[receivedState.PubKey] = peerState
// info log about disconnect with impacted routes (throttled)
d.logPeerDisconnectIfNeeded(receivedState.PubKey, peerState)
if hasConnStatusChanged(oldState, receivedState.ConnStatus) {
d.notifyPeerListChanged()
}
@@ -529,6 +538,49 @@ func (d *Status) UpdatePeerICEStateToDisconnected(receivedState State) error {
return nil
}
// logPeerDisconnectIfNeeded logs an info message when a routing peer transitions to disconnected
// with the number of impacted routes. Throttled to once per peer per 30 seconds.
func (d *Status) logPeerDisconnectIfNeeded(pubKey string, state State) {
if state.ConnStatus != StatusIdle {
return
}
now := time.Now()
last, ok := d.lastDisconnectLog[pubKey]
if ok && now.Sub(last) < 10*time.Second {
return
}
d.lastDisconnectLog[pubKey] = now
routes := state.GetRoutes()
numRoutes := len(routes)
fqdn := state.FQDN
if fqdn == "" {
fqdn = pubKey
}
// prepare a bounded list of impacted routes to avoid huge log lines
maxList := 20
list := make([]string, 0, maxList)
for r := range routes {
if len(list) >= maxList {
break
}
list = append(list, r)
}
more := ""
if numRoutes > len(list) {
more = ", more=" + fmt.Sprintf("%d", numRoutes-len(list))
}
if len(list) > 0 {
log.Warnf("[d] Routing peer disconnected: peer=%s impacted_routes=%d routes=%v%s", fqdn, numRoutes, list, more)
} else {
log.Warnf("[d] Routing peer disconnected: peer=%s impacted_routes=%d", fqdn, numRoutes)
}
}
// UpdateWireGuardPeerState updates the WireGuard bits of the peer state
func (d *Status) UpdateWireGuardPeerState(pubKey string, wgStats configurer.WGStats) error {
d.mux.Lock()

View File

@@ -14,7 +14,7 @@ import (
"github.com/netbirdio/netbird/formatter"
)
const defaultLogSize = 15
const defaultLogSize = 100
const (
LogConsole = "console"