From aae622314e5d7e31261a9d343623b89e70b73fb2 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Sat, 18 Sep 2021 12:59:55 -0700 Subject: [PATCH] tailcfg, health: add way for control plane to add problems to health check So if the control plane knows that something's broken about the node, it can include problem(s) in MapResponse and "tailscale status" will show it. (and GUIs in the future, as it's in ipnstate.Status/JSON) This also bumps the MapRequest.Version, though it's not strictly required. Doesn't hurt. Signed-off-by: Brad Fitzpatrick --- control/controlclient/map.go | 5 +++++ health/health.go | 11 +++++++++++ ipn/ipnlocal/local.go | 6 ++++++ tailcfg/tailcfg.go | 11 ++++++++++- types/netmap/netmap.go | 7 +++++++ 5 files changed, 39 insertions(+), 1 deletion(-) diff --git a/control/controlclient/map.go b/control/controlclient/map.go index 40d3109e3..cdeca1248 100644 --- a/control/controlclient/map.go +++ b/control/controlclient/map.go @@ -44,6 +44,7 @@ type mapSession struct { collectServices bool previousPeers []*tailcfg.Node // for delta-purposes lastDomain string + lastHealth []string // netMapBuilding is non-nil during a netmapForResponse call, // containing the value to be returned, once fully populated. 
@@ -105,6 +106,9 @@ func (ms *mapSession) netmapForResponse(resp *tailcfg.MapResponse) *netmap.Netwo if resp.Domain != "" { ms.lastDomain = resp.Domain } + if resp.Health != nil { + ms.lastHealth = resp.Health + } nm := &netmap.NetworkMap{ NodeKey: tailcfg.NodeKey(ms.privateNodeKey.Public()), @@ -118,6 +122,7 @@ func (ms *mapSession) netmapForResponse(resp *tailcfg.MapResponse) *netmap.Netwo CollectServices: ms.collectServices, DERPMap: ms.lastDERPMap, Debug: resp.Debug, + ControlHealth: ms.lastHealth, } ms.netMapBuilding = nm diff --git a/health/health.go b/health/health.go index 0dade963f..314107cfa 100644 --- a/health/health.go +++ b/health/health.go @@ -40,6 +40,7 @@ ipnWantRunning bool anyInterfaceUp = true // until told otherwise udp4Unbound bool + controlHealth []string ) // Subsystem is the name of a subsystem whose health can be monitored. @@ -141,6 +142,13 @@ func setLocked(key Subsystem, err error) { } } +func SetControlHealth(problems []string) { + mu.Lock() + defer mu.Unlock() + controlHealth = problems + selfCheckLocked() +} + // GotStreamedMapResponse notes that we got a tailcfg.MapResponse // message in streaming mode, even if it's just a keep-alive message. 
func GotStreamedMapResponse() { @@ -318,6 +326,9 @@ func overallErrorLocked() error { for regionID, problem := range derpRegionHealthProblem { errs = append(errs, fmt.Errorf("derp%d: %v", regionID, problem)) } + for _, s := range controlHealth { + errs = append(errs, errors.New(s)) + } if e := fakeErrForTesting; len(errs) == 0 && e != "" { return errors.New(e) } diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 93667d283..3e7dd33c3 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -2548,6 +2548,12 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { } b.maybePauseControlClientLocked() + if nm != nil { + health.SetControlHealth(nm.ControlHealth) + } else { + health.SetControlHealth(nil) + } + // Determine if file sharing is enabled fs := hasCapability(nm, tailcfg.CapabilityFileSharing) if fs != b.capFileSharing { diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index bef39bb2b..53ba9b7a2 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -47,7 +47,8 @@ // 21: 2021-06-15: added MapResponse.DNSConfig.CertDomains // 22: 2021-06-16: added MapResponse.DNSConfig.ExtraRecords // 23: 2021-08-25: DNSConfig.Routes values may be empty (for ExtraRecords support in 1.14.1+) -const CurrentMapRequestVersion = 23 +// 24: 2021-09-18: MapResponse.Health from control to node; node shows in "tailscale status" +const CurrentMapRequestVersion = 24 type StableID string @@ -1028,6 +1029,14 @@ type MapResponse struct { // user profiles only. UserProfiles []UserProfile `json:",omitempty"` + // Health, if non-nil, sets the health state + // of the node from the control plane's perspective. + // A nil value means no change from the previous MapResponse. + // A non-nil 0-length slice restores the health to good (no known problems). + // A non-zero length slice is the list of problems that the control plane + // sees. 
+ Health []string `json:",omitempty"` + // Debug is normally nil, except for when the control server // is setting debug settings on a node. Debug *Debug `json:",omitempty"` diff --git a/types/netmap/netmap.go b/types/netmap/netmap.go index 556ed5d06..5eec06c6f 100644 --- a/types/netmap/netmap.go +++ b/types/netmap/netmap.go @@ -54,6 +54,13 @@ type NetworkMap struct { // Debug knobs from control server for debug or feature gating. Debug *tailcfg.Debug + // ControlHealth is the list of health check problems for this + // node from the perspective of the control plane. + // If empty, there are no known problems from the control plane's + // point of view, but the node might know about its own health + // check problems. + ControlHealth []string + // ACLs User tailcfg.UserID