net/dns, health: raise health warning for failing forwarded DNS queries (#12888)

updates tailscale/corp#21823

Misconfigured, broken, or blocked DNS will often present as
"internet is broken'" to the end user.  This  plumbs the health tracker
into the dns manager and forwarder and adds a health warning
with a 5 second delay that is raised on failures in the forwarder and
lowered on successes.

Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
This commit is contained in:
Jonathan Nobels 2024-07-29 13:48:46 -04:00 committed by GitHub
parent 3088c6105e
commit 19b0c8a024
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 53 additions and 8 deletions

View File

@ -32,4 +32,8 @@
// ArgServerName provides a Warnable with the hostname of a server involved in the unhealthy state.
ArgServerName Arg = "server-name"
// ArgServerName provides a Warnable with comma delimited list of the hostname of the servers involved in the unhealthy state.
// If no nameservers were available to query, this will be an empty string.
ArgDNSServers Arg = "dns-servers"
)

View File

@ -82,7 +82,7 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
m := &Manager{
logf: logf,
resolver: resolver.New(logf, linkSel, dialer, knobs),
resolver: resolver.New(logf, linkSel, dialer, health, knobs),
os: oscfg,
health: health,
knobs: knobs,

View File

@ -15,6 +15,7 @@
"github.com/google/go-cmp/cmp"
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/net/tsdial"
"tailscale.com/tstest"
@ -88,7 +89,7 @@ func TestDNSOverTCP(t *testing.T) {
SearchDomains: fqdns("coffee.shop"),
},
}
m := NewManager(t.Logf, &f, nil, tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
m := NewManager(t.Logf, &f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
m.resolver.TestOnlySetHook(f.SetResolver)
m.Set(Config{
Hosts: hosts(
@ -173,7 +174,7 @@ func TestDNSOverTCP_TooLarge(t *testing.T) {
SearchDomains: fqdns("coffee.shop"),
},
}
m := NewManager(log, &f, nil, tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
m := NewManager(log, &f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
m.resolver.TestOnlySetHook(f.SetResolver)
m.Set(Config{
Hosts: hosts("andrew.ts.com.", "1.2.3.4"),

View File

@ -23,6 +23,7 @@
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/net/dns/publicdns"
"tailscale.com/net/dnscache"
"tailscale.com/net/neterror"
@ -164,6 +165,23 @@ func clampEDNSSize(packet []byte, maxSize uint16) {
binary.BigEndian.PutUint16(opt[3:5], maxSize)
}
// dnsForwarderFailing should be raised when the forwarder is unable to reach the
// upstream resolvers. This is a high severity warning as it results in "no internet".
// This warning must be cleared when the forwarder is working again.
//
// We allow for 5 second grace period to ensure this is not raised for spurious errors
// under the assumption that DNS queries are relatively frequent and a subsequent
// successful query will clear any one-off errors.
var dnsForwarderFailing = health.Register(&health.Warnable{
Code: "dns-forward-failing",
Title: "DNS unavailable",
Severity: health.SeverityHigh,
DependsOn: []*health.Warnable{health.NetworkStatusWarnable},
Text: health.StaticMessage("Tailscale can't reach the configured DNS servers. Internet connectivity may be affected."),
ImpactsConnectivity: true,
TimeToVisible: 5 * time.Second,
})
type route struct {
Suffix dnsname.FQDN
Resolvers []resolverAndDelay
@ -188,6 +206,7 @@ type forwarder struct {
netMon *netmon.Monitor // always non-nil
linkSel ForwardLinkSelector // TODO(bradfitz): remove this when tsdial.Dialer absorbs it
dialer *tsdial.Dialer
health *health.Tracker // always non-nil
controlKnobs *controlknobs.Knobs // or nil
@ -219,7 +238,7 @@ type forwarder struct {
missingUpstreamRecovery func()
}
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder {
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder {
if netMon == nil {
panic("nil netMon")
}
@ -228,6 +247,7 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS
netMon: netMon,
linkSel: linkSel,
dialer: dialer,
health: health,
controlKnobs: knobs,
missingUpstreamRecovery: func() {},
}
@ -887,6 +907,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
resolvers = f.resolvers(domain)
if len(resolvers) == 0 {
metricDNSFwdErrorNoUpstream.Add(1)
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""})
f.logf("no upstream resolvers set, returning SERVFAIL")
// Attempt to recompile the DNS configuration
@ -909,6 +930,8 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
case responseChan <- res:
return nil
}
} else {
f.health.SetHealthy(dnsForwarderFailing)
}
}
@ -960,6 +983,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
return fmt.Errorf("waiting to send response: %w", ctx.Err())
case responseChan <- packet{v, query.family, query.addr}:
metricDNSFwdSuccess.Add(1)
f.health.SetHealthy(dnsForwarderFailing)
return nil
}
case err := <-errc:
@ -979,6 +1003,11 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
case <-ctx.Done():
metricDNSFwdErrorContext.Add(1)
metricDNSFwdErrorContextGotError.Add(1)
var resolverAddrs []string
for _, rr := range resolvers {
resolverAddrs = append(resolverAddrs, rr.name.Addr)
}
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
case responseChan <- res:
}
}
@ -999,6 +1028,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
for _, rr := range resolvers {
resolverAddrs = append(resolverAddrs, rr.name.Addr)
}
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
return fmt.Errorf("waiting for response or error from %v: %w", resolverAddrs, ctx.Err())
}
}

View File

@ -24,6 +24,7 @@
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/net/tsdial"
"tailscale.com/types/dnstype"
@ -457,7 +458,7 @@ func runTestQuery(tb testing.TB, port uint16, request []byte, modify func(*forwa
var dialer tsdial.Dialer
dialer.SetNetMon(netMon)
fwd := newForwarder(tb.Logf, netMon, nil, &dialer, nil)
fwd := newForwarder(tb.Logf, netMon, nil, &dialer, new(health.Tracker), nil)
if modify != nil {
modify(fwd)
}

View File

@ -25,6 +25,7 @@
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/net/dns/resolvconffile"
"tailscale.com/net/netaddr"
"tailscale.com/net/netmon"
@ -202,6 +203,7 @@ type Resolver struct {
logf logger.Logf
netMon *netmon.Monitor // non-nil
dialer *tsdial.Dialer // non-nil
health *health.Tracker // non-nil
saveConfigForTests func(cfg Config) // used in tests to capture resolver config
// forwarder forwards requests to upstream nameservers.
forwarder *forwarder
@ -224,10 +226,14 @@ type ForwardLinkSelector interface {
}
// New returns a new resolver.
func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *Resolver {
// dialer and health must be non-nil.
func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *Resolver {
if dialer == nil {
panic("nil Dialer")
}
if health == nil {
panic("nil health")
}
netMon := dialer.NetMon()
if netMon == nil {
logf("nil netMon")
@ -239,8 +245,9 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, k
hostToIP: map[dnsname.FQDN][]netip.Addr{},
ipToHost: map[netip.Addr]dnsname.FQDN{},
dialer: dialer,
health: health,
}
r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, knobs)
r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, health, knobs)
return r
}

View File

@ -23,6 +23,7 @@
miekdns "github.com/miekg/dns"
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/health"
"tailscale.com/net/netaddr"
"tailscale.com/net/netmon"
"tailscale.com/net/tsdial"
@ -354,6 +355,7 @@ func newResolver(t testing.TB) *Resolver {
return New(t.Logf,
nil, // no link selector
tsdial.NewDialer(netmon.NewStatic()),
new(health.Tracker),
nil, // no control knobs
)
}
@ -1068,7 +1070,7 @@ func TestForwardLinkSelection(t *testing.T) {
return "special"
}
return ""
}), new(tsdial.Dialer), nil /* no control knobs */)
}), new(tsdial.Dialer), new(health.Tracker), nil /* no control knobs */)
// Test non-special IP.
if got, err := fwd.packetListener(netip.Addr{}); err != nil {