net/dns: cache dns.Config for reuse when compileConfig fails (#16059)

fixes tailscale/corp#25612

We now keep track of any dns configurations which we could not
compile. This gives RecompileDNSConfig a configuration to
attempt to recompile and apply when the OS pokes us to indicate
that the interface dns servers have changed/updated.   The manager config
will remain unset until we have the required information to compile
it correctly which should eliminate the problematic SERVFAIL
responses (especially on macOS 15).

This also removes the missingUpstreamRecovery func in the forwarder
which is no longer required now that we have proper error handling
and recovery manager and the client.

Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
This commit is contained in:
Jonathan Nobels 2025-05-28 15:43:12 -04:00 committed by GitHub
parent ffc8ec289b
commit 5e54819cee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 76 additions and 61 deletions

View File

@ -25,7 +25,6 @@ import (
"tailscale.com/net/netmon" "tailscale.com/net/netmon"
"tailscale.com/net/tsdial" "tailscale.com/net/tsdial"
"tailscale.com/syncs" "tailscale.com/syncs"
"tailscale.com/tstime/rate"
"tailscale.com/types/dnstype" "tailscale.com/types/dnstype"
"tailscale.com/types/logger" "tailscale.com/types/logger"
"tailscale.com/util/clientmetric" "tailscale.com/util/clientmetric"
@ -63,10 +62,8 @@ type Manager struct {
knobs *controlknobs.Knobs // or nil knobs *controlknobs.Knobs // or nil
goos string // if empty, gets set to runtime.GOOS goos string // if empty, gets set to runtime.GOOS
mu sync.Mutex // guards following mu sync.Mutex // guards following
// config is the last configuration we successfully compiled or nil if there config *Config // Tracks the last viable DNS configuration set by Set. nil on failures other than compilation failures or if set has never been called.
// was any failure applying the last configuration.
config *Config
} }
// NewManagers created a new manager from the given config. // NewManagers created a new manager from the given config.
@ -93,22 +90,6 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
goos: goos, goos: goos,
} }
// Rate limit our attempts to correct our DNS configuration.
// This is done on incoming queries, we don't want to spam it.
limiter := rate.NewLimiter(1.0/5.0, 1)
// This will recompile the DNS config, which in turn will requery the system
// DNS settings. The recovery func should triggered only when we are missing
// upstream nameservers and require them to forward a query.
m.resolver.SetMissingUpstreamRecovery(func() {
if limiter.Allow() {
m.logf("resolution failed due to missing upstream nameservers. Recompiling DNS configuration.")
if err := m.RecompileDNSConfig(); err != nil {
m.logf("config recompilation failed: %v", err)
}
}
})
m.ctx, m.ctxCancel = context.WithCancel(context.Background()) m.ctx, m.ctxCancel = context.WithCancel(context.Background())
m.logf("using %T", m.os) m.logf("using %T", m.os)
return m return m
@ -117,7 +98,7 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
// Resolver returns the Manager's DNS Resolver. // Resolver returns the Manager's DNS Resolver.
func (m *Manager) Resolver() *resolver.Resolver { return m.resolver } func (m *Manager) Resolver() *resolver.Resolver { return m.resolver }
// RecompileDNSConfig sets the DNS config to the current value, which has // RecompileDNSConfig recompiles the last attempted DNS configuration, which has
// the side effect of re-querying the OS's interface nameservers. This should be used // the side effect of re-querying the OS's interface nameservers. This should be used
// on platforms where the interface nameservers can change. Darwin, for example, // on platforms where the interface nameservers can change. Darwin, for example,
// where the nameservers aren't always available when we process a major interface // where the nameservers aren't always available when we process a major interface
@ -127,14 +108,14 @@ func (m *Manager) Resolver() *resolver.Resolver { return m.resolver }
// give a better or different result than when [Manager.Set] was last called. The // give a better or different result than when [Manager.Set] was last called. The
// logic for making that determination is up to the caller. // logic for making that determination is up to the caller.
// //
// It returns [ErrNoDNSConfig] if the [Manager] has no existing DNS configuration. // It returns [ErrNoDNSConfig] if [Manager.Set] has never been called.
func (m *Manager) RecompileDNSConfig() error { func (m *Manager) RecompileDNSConfig() error {
m.mu.Lock() m.mu.Lock()
defer m.mu.Unlock() defer m.mu.Unlock()
if m.config == nil { if m.config != nil {
return ErrNoDNSConfig return m.setLocked(*m.config)
} }
return m.setLocked(*m.config) return ErrNoDNSConfig
} }
func (m *Manager) Set(cfg Config) error { func (m *Manager) Set(cfg Config) error {
@ -154,15 +135,15 @@ func (m *Manager) GetBaseConfig() (OSConfig, error) {
func (m *Manager) setLocked(cfg Config) error { func (m *Manager) setLocked(cfg Config) error {
syncs.AssertLocked(&m.mu) syncs.AssertLocked(&m.mu)
// On errors, the 'set' config is cleared.
m.config = nil
m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) { m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) {
cfg.WriteToBufioWriter(w) cfg.WriteToBufioWriter(w)
})) }))
rcfg, ocfg, err := m.compileConfig(cfg) rcfg, ocfg, err := m.compileConfig(cfg)
if err != nil { if err != nil {
// On a compilation failure, set m.config set for later reuse by
// [Manager.RecompileDNSConfig] and return the error.
m.config = &cfg
return err return err
} }
@ -174,9 +155,11 @@ func (m *Manager) setLocked(cfg Config) error {
})) }))
if err := m.resolver.SetConfig(rcfg); err != nil { if err := m.resolver.SetConfig(rcfg); err != nil {
m.config = nil
return err return err
} }
if err := m.os.SetDNS(ocfg); err != nil { if err := m.os.SetDNS(ocfg); err != nil {
m.config = nil
m.health.SetUnhealthy(osConfigurationSetWarnable, health.Args{health.ArgError: err.Error()}) m.health.SetUnhealthy(osConfigurationSetWarnable, health.Args{health.ArgError: err.Error()})
return err return err
} }
@ -355,7 +338,10 @@ func (m *Manager) compileConfig(cfg Config) (rcfg resolver.Config, ocfg OSConfig
// that as the forwarder for all DNS traffic that quad-100 doesn't handle. // that as the forwarder for all DNS traffic that quad-100 doesn't handle.
if isApple || !m.os.SupportsSplitDNS() { if isApple || !m.os.SupportsSplitDNS() {
// If the OS can't do native split-dns, read out the underlying // If the OS can't do native split-dns, read out the underlying
// resolver config and blend it into our config. // resolver config and blend it into our config. On apple platforms, [OSConfigurator.GetBaseConfig]
// has a tendency to temporarily fail if called immediately following
// an interface change. These failures should be retried if/when the OS
// indicates that the DNS configuration has changed via [RecompileDNSConfig].
cfg, err := m.os.GetBaseConfig() cfg, err := m.os.GetBaseConfig()
if err == nil { if err == nil {
baseCfg = &cfg baseCfg = &cfg

View File

@ -4,6 +4,7 @@
package dns package dns
import ( import (
"errors"
"net/netip" "net/netip"
"runtime" "runtime"
"strings" "strings"
@ -24,8 +25,9 @@ type fakeOSConfigurator struct {
SplitDNS bool SplitDNS bool
BaseConfig OSConfig BaseConfig OSConfig
OSConfig OSConfig OSConfig OSConfig
ResolverConfig resolver.Config ResolverConfig resolver.Config
GetBaseConfigErr *error
} }
func (c *fakeOSConfigurator) SetDNS(cfg OSConfig) error { func (c *fakeOSConfigurator) SetDNS(cfg OSConfig) error {
@ -45,6 +47,9 @@ func (c *fakeOSConfigurator) SupportsSplitDNS() bool {
} }
func (c *fakeOSConfigurator) GetBaseConfig() (OSConfig, error) { func (c *fakeOSConfigurator) GetBaseConfig() (OSConfig, error) {
if c.GetBaseConfigErr != nil {
return OSConfig{}, *c.GetBaseConfigErr
}
return c.BaseConfig, nil return c.BaseConfig, nil
} }
@ -1019,3 +1024,50 @@ func upstreams(strs ...string) (ret map[dnsname.FQDN][]*dnstype.Resolver) {
} }
return ret return ret
} }
func TestConfigRecompilation(t *testing.T) {
fakeErr := errors.New("fake os configurator error")
f := &fakeOSConfigurator{}
f.GetBaseConfigErr = &fakeErr
f.BaseConfig = OSConfig{
Nameservers: mustIPs("1.1.1.1"),
}
config := Config{
Routes: upstreams("ts.net", "69.4.2.0", "foo.ts.net", ""),
SearchDomains: fqdns("foo.ts.net"),
}
m := NewManager(t.Logf, f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "darwin")
var managerConfig *resolver.Config
m.resolver.TestOnlySetHook(func(cfg resolver.Config) {
managerConfig = &cfg
})
// Initial set should error out and store the config
if err := m.Set(config); err == nil {
t.Fatalf("Want non-nil error. Got nil")
}
if m.config == nil {
t.Fatalf("Want persisted config. Got nil.")
}
if managerConfig != nil {
t.Fatalf("Want nil managerConfig. Got %v", managerConfig)
}
// Clear the error. We should take the happy path now and
// set m.manager's Config.
f.GetBaseConfigErr = nil
// Recompilation without an error should succeed and set m.config and m.manager's [resolver.Config]
if err := m.RecompileDNSConfig(); err != nil {
t.Fatalf("Want nil error. Got err %v", err)
}
if m.config == nil {
t.Fatalf("Want non-nil config. Got nil")
}
if managerConfig == nil {
t.Fatalf("Want non nil managerConfig. Got nil")
}
}

View File

@ -245,12 +245,6 @@ type forwarder struct {
// /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub // /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub
// resolver lookup. // resolver lookup.
cloudHostFallback []resolverAndDelay cloudHostFallback []resolverAndDelay
// missingUpstreamRecovery, if non-nil, is set called when a SERVFAIL is
// returned due to missing upstream resolvers.
//
// This should attempt to properly (re)set the upstream resolvers.
missingUpstreamRecovery func()
} }
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder { func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder {
@ -258,13 +252,12 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS
panic("nil netMon") panic("nil netMon")
} }
f := &forwarder{ f := &forwarder{
logf: logger.WithPrefix(logf, "forward: "), logf: logger.WithPrefix(logf, "forward: "),
netMon: netMon, netMon: netMon,
linkSel: linkSel, linkSel: linkSel,
dialer: dialer, dialer: dialer,
health: health, health: health,
controlKnobs: knobs, controlKnobs: knobs,
missingUpstreamRecovery: func() {},
} }
f.ctx, f.ctxCancel = context.WithCancel(context.Background()) f.ctx, f.ctxCancel = context.WithCancel(context.Background())
return f return f
@ -962,13 +955,6 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""}) f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""})
f.logf("no upstream resolvers set, returning SERVFAIL") f.logf("no upstream resolvers set, returning SERVFAIL")
// Attempt to recompile the DNS configuration
// If we are being asked to forward queries and we have no
// nameservers, the network is in a bad state.
if f.missingUpstreamRecovery != nil {
f.missingUpstreamRecovery()
}
res, err := servfailResponse(query) res, err := servfailResponse(query)
if err != nil { if err != nil {
return err return err

View File

@ -251,15 +251,6 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, h
return r return r
} }
// SetMissingUpstreamRecovery sets a callback to be called upon encountering
// a SERVFAIL due to missing upstream resolvers.
//
// This call should only happen before the resolver is used. It is not safe
// for concurrent use.
func (r *Resolver) SetMissingUpstreamRecovery(f func()) {
r.forwarder.missingUpstreamRecovery = f
}
func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook } func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook }
func (r *Resolver) SetConfig(cfg Config) error { func (r *Resolver) SetConfig(cfg Config) error {