From 5e54819ceecd789bce87c42b09b632932931d794 Mon Sep 17 00:00:00 2001 From: Jonathan Nobels Date: Wed, 28 May 2025 15:43:12 -0400 Subject: [PATCH] net/dns: cache dns.Config for reuse when compileConfig fails (#16059) fixes tailscale/corp#25612 We now keep track of any dns configurations which we could not compile. This gives RecompileDNSConfig a configuration to attempt to recompile and apply when the OS pokes us to indicate that the interface dns servers have changed/updated. The manager config will remain unset until we have the required information to compile it correctly which should eliminate the problematic SERVFAIL responses (especially on macOS 15). This also removes the missingUpstreamRecovery func in the forwarder which is no longer required now that we have proper error handling and recovery manager and the client. Signed-off-by: Jonathan Nobels --- net/dns/manager.go | 46 ++++++++++------------------ net/dns/manager_test.go | 56 +++++++++++++++++++++++++++++++++-- net/dns/resolver/forwarder.go | 26 ++++------------ net/dns/resolver/tsdns.go | 9 ------ 4 files changed, 76 insertions(+), 61 deletions(-) diff --git a/net/dns/manager.go b/net/dns/manager.go index 64bf12c6b..5d6f225ce 100644 --- a/net/dns/manager.go +++ b/net/dns/manager.go @@ -25,7 +25,6 @@ import ( "tailscale.com/net/netmon" "tailscale.com/net/tsdial" "tailscale.com/syncs" - "tailscale.com/tstime/rate" "tailscale.com/types/dnstype" "tailscale.com/types/logger" "tailscale.com/util/clientmetric" @@ -63,10 +62,8 @@ type Manager struct { knobs *controlknobs.Knobs // or nil goos string // if empty, gets set to runtime.GOOS - mu sync.Mutex // guards following - // config is the last configuration we successfully compiled or nil if there - // was any failure applying the last configuration. - config *Config + mu sync.Mutex // guards following + config *Config // Tracks the last viable DNS configuration set by Set. nil on failures other than compilation failures or if set has never been called. } // NewManagers created a new manager from the given config. @@ -93,22 +90,6 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, goos: goos, } - // Rate limit our attempts to correct our DNS configuration. - // This is done on incoming queries, we don't want to spam it. - limiter := rate.NewLimiter(1.0/5.0, 1) - - // This will recompile the DNS config, which in turn will requery the system - // DNS settings. The recovery func should triggered only when we are missing - // upstream nameservers and require them to forward a query. - m.resolver.SetMissingUpstreamRecovery(func() { - if limiter.Allow() { - m.logf("resolution failed due to missing upstream nameservers. Recompiling DNS configuration.") - if err := m.RecompileDNSConfig(); err != nil { - m.logf("config recompilation failed: %v", err) - } - } - }) - m.ctx, m.ctxCancel = context.WithCancel(context.Background()) m.logf("using %T", m.os) return m @@ -117,7 +98,7 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, // Resolver returns the Manager's DNS Resolver. func (m *Manager) Resolver() *resolver.Resolver { return m.resolver } -// RecompileDNSConfig sets the DNS config to the current value, which has +// RecompileDNSConfig recompiles the last attempted DNS configuration, which has // the side effect of re-querying the OS's interface nameservers. This should be used // on platforms where the interface nameservers can change. Darwin, for example, // where the nameservers aren't always available when we process a major interface @@ -127,14 +108,14 @@ func (m *Manager) Resolver() *resolver.Resolver { return m.resolver } // give a better or different result than when [Manager.Set] was last called. The // logic for making that determination is up to the caller. // -// It returns [ErrNoDNSConfig] if the [Manager] has no existing DNS configuration. +// It returns [ErrNoDNSConfig] if [Manager.Set] has never been called. func (m *Manager) RecompileDNSConfig() error { m.mu.Lock() defer m.mu.Unlock() - if m.config == nil { - return ErrNoDNSConfig + if m.config != nil { + return m.setLocked(*m.config) } - return m.setLocked(*m.config) + return ErrNoDNSConfig } func (m *Manager) Set(cfg Config) error { @@ -154,15 +135,15 @@ func (m *Manager) GetBaseConfig() (OSConfig, error) { func (m *Manager) setLocked(cfg Config) error { syncs.AssertLocked(&m.mu) - // On errors, the 'set' config is cleared. - m.config = nil - m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) { cfg.WriteToBufioWriter(w) })) rcfg, ocfg, err := m.compileConfig(cfg) if err != nil { + // On a compilation failure, set m.config set for later reuse by + // [Manager.RecompileDNSConfig] and return the error. + m.config = &cfg return err } @@ -174,9 +155,11 @@ func (m *Manager) setLocked(cfg Config) error { })) if err := m.resolver.SetConfig(rcfg); err != nil { + m.config = nil return err } if err := m.os.SetDNS(ocfg); err != nil { + m.config = nil m.health.SetUnhealthy(osConfigurationSetWarnable, health.Args{health.ArgError: err.Error()}) return err } @@ -355,7 +338,10 @@ func (m *Manager) compileConfig(cfg Config) (rcfg resolver.Config, ocfg OSConfig // that as the forwarder for all DNS traffic that quad-100 doesn't handle. if isApple || !m.os.SupportsSplitDNS() { // If the OS can't do native split-dns, read out the underlying - // resolver config and blend it into our config. + // resolver config and blend it into our config. On apple platforms, [OSConfigurator.GetBaseConfig] + // has a tendency to temporarily fail if called immediately following + // an interface change. These failures should be retried if/when the OS + // indicates that the DNS configuration has changed via [RecompileDNSConfig]. cfg, err := m.os.GetBaseConfig() if err == nil { baseCfg = &cfg diff --git a/net/dns/manager_test.go b/net/dns/manager_test.go index 2bdbc72e2..522f9636a 100644 --- a/net/dns/manager_test.go +++ b/net/dns/manager_test.go @@ -4,6 +4,7 @@ package dns import ( + "errors" "net/netip" "runtime" "strings" @@ -24,8 +25,9 @@ type fakeOSConfigurator struct { SplitDNS bool BaseConfig OSConfig - OSConfig OSConfig - ResolverConfig resolver.Config + OSConfig OSConfig + ResolverConfig resolver.Config + GetBaseConfigErr *error } func (c *fakeOSConfigurator) SetDNS(cfg OSConfig) error { @@ -45,6 +47,9 @@ func (c *fakeOSConfigurator) SupportsSplitDNS() bool { } func (c *fakeOSConfigurator) GetBaseConfig() (OSConfig, error) { + if c.GetBaseConfigErr != nil { + return OSConfig{}, *c.GetBaseConfigErr + } return c.BaseConfig, nil } @@ -1019,3 +1024,50 @@ func upstreams(strs ...string) (ret map[dnsname.FQDN][]*dnstype.Resolver) { } return ret } + +func TestConfigRecompilation(t *testing.T) { + fakeErr := errors.New("fake os configurator error") + f := &fakeOSConfigurator{} + f.GetBaseConfigErr = &fakeErr + f.BaseConfig = OSConfig{ + Nameservers: mustIPs("1.1.1.1"), + } + + config := Config{ + Routes: upstreams("ts.net", "69.4.2.0", "foo.ts.net", ""), + SearchDomains: fqdns("foo.ts.net"), + } + + m := NewManager(t.Logf, f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "darwin") + + var managerConfig *resolver.Config + m.resolver.TestOnlySetHook(func(cfg resolver.Config) { + managerConfig = &cfg + }) + + // Initial set should error out and store the config + if err := m.Set(config); err == nil { + t.Fatalf("Want non-nil error. Got nil") + } + if m.config == nil { + t.Fatalf("Want persisted config. Got nil.") + } + if managerConfig != nil { + t.Fatalf("Want nil managerConfig. Got %v", managerConfig) + } + + // Clear the error. We should take the happy path now and + // set m.manager's Config. + f.GetBaseConfigErr = nil + + // Recompilation without an error should succeed and set m.config and m.manager's [resolver.Config] + if err := m.RecompileDNSConfig(); err != nil { + t.Fatalf("Want nil error. Got err %v", err) + } + if m.config == nil { + t.Fatalf("Want non-nil config. Got nil") + } + if managerConfig == nil { + t.Fatalf("Want non nil managerConfig. Got nil") + } +} diff --git a/net/dns/resolver/forwarder.go b/net/dns/resolver/forwarder.go index 321401a84..c87fbd504 100644 --- a/net/dns/resolver/forwarder.go +++ b/net/dns/resolver/forwarder.go @@ -245,12 +245,6 @@ type forwarder struct { // /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub // resolver lookup. cloudHostFallback []resolverAndDelay - - // missingUpstreamRecovery, if non-nil, is set called when a SERVFAIL is - // returned due to missing upstream resolvers. - // - // This should attempt to properly (re)set the upstream resolvers. - missingUpstreamRecovery func() } func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder { @@ -258,13 +252,12 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS panic("nil netMon") } f := &forwarder{ - logf: logger.WithPrefix(logf, "forward: "), - netMon: netMon, - linkSel: linkSel, - dialer: dialer, - health: health, - controlKnobs: knobs, - missingUpstreamRecovery: func() {}, + logf: logger.WithPrefix(logf, "forward: "), + netMon: netMon, + linkSel: linkSel, + dialer: dialer, + health: health, + controlKnobs: knobs, } f.ctx, f.ctxCancel = context.WithCancel(context.Background()) return f @@ -962,13 +955,6 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""}) f.logf("no upstream resolvers set, returning SERVFAIL") - // Attempt to recompile the DNS configuration - // If we are being asked to forward queries and we have no - // nameservers, the network is in a bad state. - if f.missingUpstreamRecovery != nil { - f.missingUpstreamRecovery() - } - res, err := servfailResponse(query) if err != nil { return err diff --git a/net/dns/resolver/tsdns.go b/net/dns/resolver/tsdns.go index 107740b13..33fa9c3c0 100644 --- a/net/dns/resolver/tsdns.go +++ b/net/dns/resolver/tsdns.go @@ -251,15 +251,6 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, h return r } -// SetMissingUpstreamRecovery sets a callback to be called upon encountering -// a SERVFAIL due to missing upstream resolvers. -// -// This call should only happen before the resolver is used. It is not safe -// for concurrent use. -func (r *Resolver) SetMissingUpstreamRecovery(f func()) { - r.forwarder.missingUpstreamRecovery = f -} - func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook } func (r *Resolver) SetConfig(cfg Config) error {