net/dns: cache dns.Config for reuse when compileConfig fails (#16059)

fixes tailscale/corp#25612

We now keep track of any dns configurations which we could not
compile. This gives RecompileDNSConfig a configuration to
attempt to recompile and apply when the OS pokes us to indicate
that the interface dns servers have changed/updated.   The manager config
will remain unset until we have the required information to compile
it correctly which should eliminate the problematic SERVFAIL
responses (especially on macOS 15).

This also removes the missingUpstreamRecovery func in the forwarder
which is no longer required now that we have proper error handling
and recovery manager and the client.

Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
This commit is contained in:
Jonathan Nobels 2025-05-28 15:43:12 -04:00 committed by GitHub
parent ffc8ec289b
commit 5e54819cee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 76 additions and 61 deletions

View File

@ -25,7 +25,6 @@ import (
"tailscale.com/net/netmon"
"tailscale.com/net/tsdial"
"tailscale.com/syncs"
"tailscale.com/tstime/rate"
"tailscale.com/types/dnstype"
"tailscale.com/types/logger"
"tailscale.com/util/clientmetric"
@ -63,10 +62,8 @@ type Manager struct {
knobs *controlknobs.Knobs // or nil
goos string // if empty, gets set to runtime.GOOS
mu sync.Mutex // guards following
// config is the last configuration we successfully compiled or nil if there
// was any failure applying the last configuration.
config *Config
mu sync.Mutex // guards following
config *Config // Tracks the last viable DNS configuration set by Set. nil on failures other than compilation failures or if set has never been called.
}
// NewManagers created a new manager from the given config.
@ -93,22 +90,6 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
goos: goos,
}
// Rate limit our attempts to correct our DNS configuration.
// This is done on incoming queries, we don't want to spam it.
limiter := rate.NewLimiter(1.0/5.0, 1)
// This will recompile the DNS config, which in turn will requery the system
// DNS settings. The recovery func should triggered only when we are missing
// upstream nameservers and require them to forward a query.
m.resolver.SetMissingUpstreamRecovery(func() {
if limiter.Allow() {
m.logf("resolution failed due to missing upstream nameservers. Recompiling DNS configuration.")
if err := m.RecompileDNSConfig(); err != nil {
m.logf("config recompilation failed: %v", err)
}
}
})
m.ctx, m.ctxCancel = context.WithCancel(context.Background())
m.logf("using %T", m.os)
return m
@ -117,7 +98,7 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
// Resolver returns the Manager's DNS Resolver.
func (m *Manager) Resolver() *resolver.Resolver { return m.resolver }
// RecompileDNSConfig sets the DNS config to the current value, which has
// RecompileDNSConfig recompiles the last attempted DNS configuration, which has
// the side effect of re-querying the OS's interface nameservers. This should be used
// on platforms where the interface nameservers can change. Darwin, for example,
// where the nameservers aren't always available when we process a major interface
@ -127,14 +108,14 @@ func (m *Manager) Resolver() *resolver.Resolver { return m.resolver }
// give a better or different result than when [Manager.Set] was last called. The
// logic for making that determination is up to the caller.
//
// It returns [ErrNoDNSConfig] if the [Manager] has no existing DNS configuration.
// It returns [ErrNoDNSConfig] if [Manager.Set] has never been called.
func (m *Manager) RecompileDNSConfig() error {
m.mu.Lock()
defer m.mu.Unlock()
if m.config == nil {
return ErrNoDNSConfig
if m.config != nil {
return m.setLocked(*m.config)
}
return m.setLocked(*m.config)
return ErrNoDNSConfig
}
func (m *Manager) Set(cfg Config) error {
@ -154,15 +135,15 @@ func (m *Manager) GetBaseConfig() (OSConfig, error) {
func (m *Manager) setLocked(cfg Config) error {
syncs.AssertLocked(&m.mu)
// On errors, the 'set' config is cleared.
m.config = nil
m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) {
cfg.WriteToBufioWriter(w)
}))
rcfg, ocfg, err := m.compileConfig(cfg)
if err != nil {
// On a compilation failure, set m.config set for later reuse by
// [Manager.RecompileDNSConfig] and return the error.
m.config = &cfg
return err
}
@ -174,9 +155,11 @@ func (m *Manager) setLocked(cfg Config) error {
}))
if err := m.resolver.SetConfig(rcfg); err != nil {
m.config = nil
return err
}
if err := m.os.SetDNS(ocfg); err != nil {
m.config = nil
m.health.SetUnhealthy(osConfigurationSetWarnable, health.Args{health.ArgError: err.Error()})
return err
}
@ -355,7 +338,10 @@ func (m *Manager) compileConfig(cfg Config) (rcfg resolver.Config, ocfg OSConfig
// that as the forwarder for all DNS traffic that quad-100 doesn't handle.
if isApple || !m.os.SupportsSplitDNS() {
// If the OS can't do native split-dns, read out the underlying
// resolver config and blend it into our config.
// resolver config and blend it into our config. On apple platforms, [OSConfigurator.GetBaseConfig]
// has a tendency to temporarily fail if called immediately following
// an interface change. These failures should be retried if/when the OS
// indicates that the DNS configuration has changed via [RecompileDNSConfig].
cfg, err := m.os.GetBaseConfig()
if err == nil {
baseCfg = &cfg

View File

@ -4,6 +4,7 @@
package dns
import (
"errors"
"net/netip"
"runtime"
"strings"
@ -24,8 +25,9 @@ type fakeOSConfigurator struct {
SplitDNS bool
BaseConfig OSConfig
OSConfig OSConfig
ResolverConfig resolver.Config
OSConfig OSConfig
ResolverConfig resolver.Config
GetBaseConfigErr *error
}
func (c *fakeOSConfigurator) SetDNS(cfg OSConfig) error {
@ -45,6 +47,9 @@ func (c *fakeOSConfigurator) SupportsSplitDNS() bool {
}
func (c *fakeOSConfigurator) GetBaseConfig() (OSConfig, error) {
if c.GetBaseConfigErr != nil {
return OSConfig{}, *c.GetBaseConfigErr
}
return c.BaseConfig, nil
}
@ -1019,3 +1024,50 @@ func upstreams(strs ...string) (ret map[dnsname.FQDN][]*dnstype.Resolver) {
}
return ret
}
func TestConfigRecompilation(t *testing.T) {
fakeErr := errors.New("fake os configurator error")
f := &fakeOSConfigurator{}
f.GetBaseConfigErr = &fakeErr
f.BaseConfig = OSConfig{
Nameservers: mustIPs("1.1.1.1"),
}
config := Config{
Routes: upstreams("ts.net", "69.4.2.0", "foo.ts.net", ""),
SearchDomains: fqdns("foo.ts.net"),
}
m := NewManager(t.Logf, f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "darwin")
var managerConfig *resolver.Config
m.resolver.TestOnlySetHook(func(cfg resolver.Config) {
managerConfig = &cfg
})
// Initial set should error out and store the config
if err := m.Set(config); err == nil {
t.Fatalf("Want non-nil error. Got nil")
}
if m.config == nil {
t.Fatalf("Want persisted config. Got nil.")
}
if managerConfig != nil {
t.Fatalf("Want nil managerConfig. Got %v", managerConfig)
}
// Clear the error. We should take the happy path now and
// set m.manager's Config.
f.GetBaseConfigErr = nil
// Recompilation without an error should succeed and set m.config and m.manager's [resolver.Config]
if err := m.RecompileDNSConfig(); err != nil {
t.Fatalf("Want nil error. Got err %v", err)
}
if m.config == nil {
t.Fatalf("Want non-nil config. Got nil")
}
if managerConfig == nil {
t.Fatalf("Want non nil managerConfig. Got nil")
}
}

View File

@ -245,12 +245,6 @@ type forwarder struct {
// /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub
// resolver lookup.
cloudHostFallback []resolverAndDelay
// missingUpstreamRecovery, if non-nil, is set called when a SERVFAIL is
// returned due to missing upstream resolvers.
//
// This should attempt to properly (re)set the upstream resolvers.
missingUpstreamRecovery func()
}
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder {
@ -258,13 +252,12 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS
panic("nil netMon")
}
f := &forwarder{
logf: logger.WithPrefix(logf, "forward: "),
netMon: netMon,
linkSel: linkSel,
dialer: dialer,
health: health,
controlKnobs: knobs,
missingUpstreamRecovery: func() {},
logf: logger.WithPrefix(logf, "forward: "),
netMon: netMon,
linkSel: linkSel,
dialer: dialer,
health: health,
controlKnobs: knobs,
}
f.ctx, f.ctxCancel = context.WithCancel(context.Background())
return f
@ -962,13 +955,6 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""})
f.logf("no upstream resolvers set, returning SERVFAIL")
// Attempt to recompile the DNS configuration
// If we are being asked to forward queries and we have no
// nameservers, the network is in a bad state.
if f.missingUpstreamRecovery != nil {
f.missingUpstreamRecovery()
}
res, err := servfailResponse(query)
if err != nil {
return err

View File

@ -251,15 +251,6 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, h
return r
}
// SetMissingUpstreamRecovery sets a callback to be called upon encountering
// a SERVFAIL due to missing upstream resolvers.
//
// This call should only happen before the resolver is used. It is not safe
// for concurrent use.
func (r *Resolver) SetMissingUpstreamRecovery(f func()) {
r.forwarder.missingUpstreamRecovery = f
}
func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook }
func (r *Resolver) SetConfig(cfg Config) error {