net/dns, health: raise health warning for failing forwarded DNS queries ()

updates 

Misconfigured, broken, or blocked DNS will often present as
"internet is broken'" to the end user.  This  plumbs the health tracker
into the dns manager and forwarder and adds a health warning
with a 5 second delay that is raised on failures in the forwarder and
lowered on successes.

Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
This commit is contained in:
Jonathan Nobels 2024-07-29 13:48:46 -04:00 committed by GitHub
parent 3088c6105e
commit 19b0c8a024
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 53 additions and 8 deletions

@ -32,4 +32,8 @@ const (
// ArgServerName provides a Warnable with the hostname of a server involved in the unhealthy state.
ArgServerName Arg = "server-name"
// ArgServerName provides a Warnable with comma delimited list of the hostname of the servers involved in the unhealthy state.
// If no nameservers were available to query, this will be an empty string.
ArgDNSServers Arg = "dns-servers"
)

@ -82,7 +82,7 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
m := &Manager{
logf: logf,
resolver: resolver.New(logf, linkSel, dialer, knobs),
resolver: resolver.New(logf, linkSel, dialer, health, knobs),
os: oscfg,
health: health,
knobs: knobs,

@ -15,6 +15,7 @@ import (
"github.com/google/go-cmp/cmp"
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/net/tsdial"
"tailscale.com/tstest"
@ -88,7 +89,7 @@ func TestDNSOverTCP(t *testing.T) {
SearchDomains: fqdns("coffee.shop"),
},
}
m := NewManager(t.Logf, &f, nil, tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
m := NewManager(t.Logf, &f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
m.resolver.TestOnlySetHook(f.SetResolver)
m.Set(Config{
Hosts: hosts(
@ -173,7 +174,7 @@ func TestDNSOverTCP_TooLarge(t *testing.T) {
SearchDomains: fqdns("coffee.shop"),
},
}
m := NewManager(log, &f, nil, tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
m := NewManager(log, &f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
m.resolver.TestOnlySetHook(f.SetResolver)
m.Set(Config{
Hosts: hosts("andrew.ts.com.", "1.2.3.4"),

@ -23,6 +23,7 @@ import (
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/net/dns/publicdns"
"tailscale.com/net/dnscache"
"tailscale.com/net/neterror"
@ -164,6 +165,23 @@ func clampEDNSSize(packet []byte, maxSize uint16) {
binary.BigEndian.PutUint16(opt[3:5], maxSize)
}
// dnsForwarderFailing should be raised when the forwarder is unable to reach the
// upstream resolvers. This is a high severity warning as it results in "no internet".
// This warning must be cleared when the forwarder is working again.
//
// We allow for 5 second grace period to ensure this is not raised for spurious errors
// under the assumption that DNS queries are relatively frequent and a subsequent
// successful query will clear any one-off errors.
var dnsForwarderFailing = health.Register(&health.Warnable{
Code: "dns-forward-failing",
Title: "DNS unavailable",
Severity: health.SeverityHigh,
DependsOn: []*health.Warnable{health.NetworkStatusWarnable},
Text: health.StaticMessage("Tailscale can't reach the configured DNS servers. Internet connectivity may be affected."),
ImpactsConnectivity: true,
TimeToVisible: 5 * time.Second,
})
type route struct {
Suffix dnsname.FQDN
Resolvers []resolverAndDelay
@ -188,6 +206,7 @@ type forwarder struct {
netMon *netmon.Monitor // always non-nil
linkSel ForwardLinkSelector // TODO(bradfitz): remove this when tsdial.Dialer absorbs it
dialer *tsdial.Dialer
health *health.Tracker // always non-nil
controlKnobs *controlknobs.Knobs // or nil
@ -219,7 +238,7 @@ type forwarder struct {
missingUpstreamRecovery func()
}
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder {
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder {
if netMon == nil {
panic("nil netMon")
}
@ -228,6 +247,7 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS
netMon: netMon,
linkSel: linkSel,
dialer: dialer,
health: health,
controlKnobs: knobs,
missingUpstreamRecovery: func() {},
}
@ -887,6 +907,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
resolvers = f.resolvers(domain)
if len(resolvers) == 0 {
metricDNSFwdErrorNoUpstream.Add(1)
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""})
f.logf("no upstream resolvers set, returning SERVFAIL")
// Attempt to recompile the DNS configuration
@ -909,6 +930,8 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
case responseChan <- res:
return nil
}
} else {
f.health.SetHealthy(dnsForwarderFailing)
}
}
@ -960,6 +983,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
return fmt.Errorf("waiting to send response: %w", ctx.Err())
case responseChan <- packet{v, query.family, query.addr}:
metricDNSFwdSuccess.Add(1)
f.health.SetHealthy(dnsForwarderFailing)
return nil
}
case err := <-errc:
@ -979,6 +1003,11 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
case <-ctx.Done():
metricDNSFwdErrorContext.Add(1)
metricDNSFwdErrorContextGotError.Add(1)
var resolverAddrs []string
for _, rr := range resolvers {
resolverAddrs = append(resolverAddrs, rr.name.Addr)
}
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
case responseChan <- res:
}
}
@ -999,6 +1028,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
for _, rr := range resolvers {
resolverAddrs = append(resolverAddrs, rr.name.Addr)
}
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
return fmt.Errorf("waiting for response or error from %v: %w", resolverAddrs, ctx.Err())
}
}

@ -24,6 +24,7 @@ import (
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/net/tsdial"
"tailscale.com/types/dnstype"
@ -457,7 +458,7 @@ func runTestQuery(tb testing.TB, port uint16, request []byte, modify func(*forwa
var dialer tsdial.Dialer
dialer.SetNetMon(netMon)
fwd := newForwarder(tb.Logf, netMon, nil, &dialer, nil)
fwd := newForwarder(tb.Logf, netMon, nil, &dialer, new(health.Tracker), nil)
if modify != nil {
modify(fwd)
}

@ -25,6 +25,7 @@ import (
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/net/dns/resolvconffile"
"tailscale.com/net/netaddr"
"tailscale.com/net/netmon"
@ -202,6 +203,7 @@ type Resolver struct {
logf logger.Logf
netMon *netmon.Monitor // non-nil
dialer *tsdial.Dialer // non-nil
health *health.Tracker // non-nil
saveConfigForTests func(cfg Config) // used in tests to capture resolver config
// forwarder forwards requests to upstream nameservers.
forwarder *forwarder
@ -224,10 +226,14 @@ type ForwardLinkSelector interface {
}
// New returns a new resolver.
func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *Resolver {
// dialer and health must be non-nil.
func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *Resolver {
if dialer == nil {
panic("nil Dialer")
}
if health == nil {
panic("nil health")
}
netMon := dialer.NetMon()
if netMon == nil {
logf("nil netMon")
@ -239,8 +245,9 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, k
hostToIP: map[dnsname.FQDN][]netip.Addr{},
ipToHost: map[netip.Addr]dnsname.FQDN{},
dialer: dialer,
health: health,
}
r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, knobs)
r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, health, knobs)
return r
}

@ -23,6 +23,7 @@ import (
miekdns "github.com/miekg/dns"
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/health"
"tailscale.com/net/netaddr"
"tailscale.com/net/netmon"
"tailscale.com/net/tsdial"
@ -354,6 +355,7 @@ func newResolver(t testing.TB) *Resolver {
return New(t.Logf,
nil, // no link selector
tsdial.NewDialer(netmon.NewStatic()),
new(health.Tracker),
nil, // no control knobs
)
}
@ -1068,7 +1070,7 @@ func TestForwardLinkSelection(t *testing.T) {
return "special"
}
return ""
}), new(tsdial.Dialer), nil /* no control knobs */)
}), new(tsdial.Dialer), new(health.Tracker), nil /* no control knobs */)
// Test non-special IP.
if got, err := fwd.packetListener(netip.Addr{}); err != nil {