mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-29 04:55:31 +00:00
net/dns: recheck DNS config on SERVFAIL errors (#12547)
Fixes tailscale/corp#20677 Replaces the original attempt to rectify this (by injecting a netMon event) which was both heavy handed, and missed cases where the netMon event was "minor". On apple platforms, the fetching the interface's nameservers can and does return an empty list in certain situations. Apple's API in particular is very limiting here. The header hints at notifications for dns changes which would let us react ahead of time, but it's all private APIs. To avoid remaining in the state where we end up with no nameservers but we absolutely need them, we'll react to a lack of upstream nameservers by attempting to re-query the OS. We'll rate limit this to space out the attempts. It seems relatively harmless to attempt a reconfig every 5 seconds (triggered by an incoming query) if the network is in this broken state. Missing nameservers might possibly be a persistent condition (vs a transient error), but that would also imply that something out of our control is badly misconfigured. Tested by randomly returning [] for the nameservers. When switching between Wifi networks, or cell->wifi, this will randomly trigger the bug, and we appear to reliably heal the DNS state. Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
This commit is contained in:
parent
d5e692f7e7
commit
27033c6277
@ -14,6 +14,7 @@
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
@ -23,6 +24,8 @@
|
||||
"tailscale.com/net/dns/resolver"
|
||||
"tailscale.com/net/netmon"
|
||||
"tailscale.com/net/tsdial"
|
||||
"tailscale.com/syncs"
|
||||
"tailscale.com/tstime/rate"
|
||||
"tailscale.com/types/dnstype"
|
||||
"tailscale.com/types/logger"
|
||||
"tailscale.com/util/clientmetric"
|
||||
@ -55,6 +58,12 @@ type Manager struct {
|
||||
os OSConfigurator
|
||||
knobs *controlknobs.Knobs // or nil
|
||||
goos string // if empty, gets set to runtime.GOOS
|
||||
|
||||
// The last configuration we successfully compiled. Set to nil if
|
||||
// there was any failure applying the last configuration
|
||||
config *Config
|
||||
// Must be held when accessing/setting config.
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
// NewManagers created a new manager from the given config.
|
||||
@ -80,6 +89,26 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
|
||||
knobs: knobs,
|
||||
goos: goos,
|
||||
}
|
||||
|
||||
// Rate limit our attempts to correct our DNS configuration.
|
||||
limiter := rate.NewLimiter(1.0/5.0, 1)
|
||||
|
||||
// This will recompile the DNS config, which in turn will requery the system
|
||||
// DNS settings. The recovery func should triggered only when we are missing
|
||||
// upstream nameservers and require them to forward a query.
|
||||
m.resolver.SetMissingUpstreamRecovery(func() {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if m.config == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if limiter.Allow() {
|
||||
m.logf("DNS resolution failed due to missing upstream nameservers. Recompiling DNS configuration.")
|
||||
m.setLocked(*m.config)
|
||||
}
|
||||
})
|
||||
|
||||
m.ctx, m.ctxCancel = context.WithCancel(context.Background())
|
||||
m.logf("using %T", m.os)
|
||||
return m
|
||||
@ -89,6 +118,19 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
|
||||
func (m *Manager) Resolver() *resolver.Resolver { return m.resolver }
|
||||
|
||||
func (m *Manager) Set(cfg Config) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return m.setLocked(cfg)
|
||||
}
|
||||
|
||||
// Sets the DNS configuration.
|
||||
// m.mu must be held
|
||||
func (m *Manager) setLocked(cfg Config) error {
|
||||
syncs.AssertLocked(&m.mu)
|
||||
|
||||
// On errors, the 'set' config is cleared.
|
||||
m.config = nil
|
||||
|
||||
m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) {
|
||||
cfg.WriteToBufioWriter(w)
|
||||
}))
|
||||
@ -112,7 +154,9 @@ func (m *Manager) Set(cfg Config) error {
|
||||
m.health.SetDNSOSHealth(err)
|
||||
return err
|
||||
}
|
||||
|
||||
m.health.SetDNSOSHealth(nil)
|
||||
m.config = &cfg
|
||||
|
||||
return nil
|
||||
}
|
||||
|
@ -14,7 +14,6 @@
|
||||
"net/http"
|
||||
"net/netip"
|
||||
"net/url"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
@ -212,6 +211,10 @@ type forwarder struct {
|
||||
// /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub
|
||||
// resolver lookup.
|
||||
cloudHostFallback []resolverAndDelay
|
||||
|
||||
// To be called when a SERVFAIL is returned due to missing upstream resolvers.
|
||||
// This should attempt to properly (re)set the upstream resolvers.
|
||||
missingUpstreamRecovery func()
|
||||
}
|
||||
|
||||
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder {
|
||||
@ -883,22 +886,10 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
metricDNSFwdErrorNoUpstream.Add(1)
|
||||
f.logf("no upstream resolvers set, returning SERVFAIL")
|
||||
|
||||
if runtime.GOOS == "darwin" || runtime.GOOS == "ios" {
|
||||
// On apple, having no upstream resolvers here is the result a race condition where
|
||||
// we've tried a reconfig after a major link change but the system has not yet set
|
||||
// the resolvers for the new link. We use SystemConfiguration to query nameservers, and
|
||||
// the timing of when that will give us the "right" answer is non-deterministic.
|
||||
//
|
||||
// This will typically happen on sleep-wake cycles with a Wifi interface where
|
||||
// it takes some random amount of time (after telling us that the interface exists)
|
||||
// for the system to configure the dns servers.
|
||||
//
|
||||
// Repolling the network monitor here is a bit odd, but if we're
|
||||
// seeing DNS queries, it's likely that the network is now fully configured, and it's
|
||||
// an ideal time to to requery for the nameservers.
|
||||
f.logf("injecting network monitor event to attempt to refresh the resolvers")
|
||||
f.netMon.InjectEvent()
|
||||
}
|
||||
// Attempt to recompile the DNS configuration
|
||||
// If we are being asked to forward queries and we have no
|
||||
// nameservers, the network is in a bad state.
|
||||
f.missingUpstreamRecovery()
|
||||
|
||||
res, err := servfailResponse(query)
|
||||
if err != nil {
|
||||
|
@ -244,6 +244,13 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, k
|
||||
return r
|
||||
}
|
||||
|
||||
// Called by the forwarder on SERVFAIL due to missing upstream resolvers
|
||||
// The func passed in here should attempt to re-query for those resolvers,
|
||||
// repair, or recover
|
||||
func (r *Resolver) SetMissingUpstreamRecovery(f func()) {
|
||||
r.forwarder.missingUpstreamRecovery = f
|
||||
}
|
||||
|
||||
func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook }
|
||||
|
||||
func (r *Resolver) SetConfig(cfg Config) error {
|
||||
|
Loading…
Reference in New Issue
Block a user