tailscale/health/health.go
Brad Fitzpatrick aae622314e tailcfg, health: add way for control plane to add problems to health check
So if the control plane knows that something's broken about the node, it can
include problem(s) in MapResponse and "tailscale status" will show it.
(and GUIs in the future, as it's in ipnstate.Status/JSON)

This also bumps the MapRequest.Version, though it's not strictly
required. Doesn't hurt.

Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2021-09-19 17:55:49 -07:00

396 lines
10 KiB
Go

// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package health is a registry for other packages to report & check
// overall health status of the node.
package health
import (
"errors"
"fmt"
"os"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/go-multierror/multierror"
"tailscale.com/tailcfg"
)
var (
// mu guards everything in this var block.
mu sync.Mutex
sysErr = map[Subsystem]error{} // error key => err (or nil for no error)
watchers = map[*watchHandle]func(Subsystem, error){} // opt func to run if error state changes
timer *time.Timer
inMapPoll bool
inMapPollSince time.Time
lastMapPollEndedAt time.Time
lastStreamedMapResponse time.Time
derpHomeRegion int
derpRegionConnected = map[int]bool{}
derpRegionHealthProblem = map[int]string{}
derpRegionLastFrame = map[int]time.Time{}
lastMapRequestHeard time.Time // time we got a 200 from control for a MapRequest
ipnState string
ipnWantRunning bool
anyInterfaceUp = true // until told otherwise
udp4Unbound bool
controlHealth []string
)
// Subsystem is the name of a subsystem whose health can be monitored.
type Subsystem string
const (
// SysOverall is the name representing the overall health of
// the system, rather than one particular subsystem.
SysOverall = Subsystem("overall")
// SysRouter is the name of the wgengine/router subsystem.
SysRouter = Subsystem("router")
// SysDNS is the name of the net/dns subsystem.
SysDNS = Subsystem("dns")
// SysNetworkCategory is the name of the subsystem that sets
// the Windows network adapter's "category" (public, private, domain).
// If it's unhealthy, the Windows firewall rules won't match.
SysNetworkCategory = Subsystem("network-category")
)
type watchHandle byte
// RegisterWatcher adds a function that will be called if an
// error changes state either to unhealthy or from unhealthy. It is
// not called on transition from unknown to healthy. It must be non-nil
// and is run in its own goroutine. The returned func unregisters it.
func RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) {
mu.Lock()
defer mu.Unlock()
handle := new(watchHandle)
watchers[handle] = cb
if timer == nil {
timer = time.AfterFunc(time.Minute, timerSelfCheck)
}
return func() {
mu.Lock()
defer mu.Unlock()
delete(watchers, handle)
if len(watchers) == 0 && timer != nil {
timer.Stop()
timer = nil
}
}
}
// SetRouterHealth sets the state of the wgengine/router.Router.
func SetRouterHealth(err error) { set(SysRouter, err) }
// RouterHealth returns the wgengine/router.Router error state.
func RouterHealth() error { return get(SysRouter) }
// SetDNSHealth sets the state of the net/dns.Manager
func SetDNSHealth(err error) { set(SysDNS, err) }
// DNSHealth returns the net/dns.Manager error state.
func DNSHealth() error { return get(SysDNS) }
// SetNetworkCategoryHealth sets the state of setting the network adaptor's category.
// This only applies on Windows.
func SetNetworkCategoryHealth(err error) { set(SysNetworkCategory, err) }
func NetworkCategoryHealth() error { return get(SysNetworkCategory) }
func get(key Subsystem) error {
mu.Lock()
defer mu.Unlock()
return sysErr[key]
}
func set(key Subsystem, err error) {
mu.Lock()
defer mu.Unlock()
setLocked(key, err)
}
func setLocked(key Subsystem, err error) {
old, ok := sysErr[key]
if !ok && err == nil {
// Initial happy path.
sysErr[key] = nil
selfCheckLocked()
return
}
if ok && (old == nil) == (err == nil) {
// No change in overall error status (nil-vs-not), so
// don't run callbacks, but exact error might've
// changed, so note it.
if err != nil {
sysErr[key] = err
}
return
}
sysErr[key] = err
selfCheckLocked()
for _, cb := range watchers {
go cb(key, err)
}
}
func SetControlHealth(problems []string) {
mu.Lock()
defer mu.Unlock()
controlHealth = problems
selfCheckLocked()
}
// GotStreamedMapResponse notes that we got a tailcfg.MapResponse
// message in streaming mode, even if it's just a keep-alive message.
func GotStreamedMapResponse() {
mu.Lock()
defer mu.Unlock()
lastStreamedMapResponse = time.Now()
selfCheckLocked()
}
// SetInPollNetMap records that we're in
func SetInPollNetMap(v bool) {
mu.Lock()
defer mu.Unlock()
if v == inMapPoll {
return
}
inMapPoll = v
if v {
inMapPollSince = time.Now()
} else {
lastMapPollEndedAt = time.Now()
}
}
// SetMagicSockDERPHome notes what magicsock's view of its home DERP is.
func SetMagicSockDERPHome(region int) {
mu.Lock()
defer mu.Unlock()
derpHomeRegion = region
selfCheckLocked()
}
// NoteMapRequestHeard notes whenever we successfully sent a map request
// to control for which we received a 200 response.
func NoteMapRequestHeard(mr *tailcfg.MapRequest) {
mu.Lock()
defer mu.Unlock()
// TODO: extract mr.HostInfo.NetInfo.PreferredDERP, compare
// against SetMagicSockDERPHome and
// SetDERPRegionConnectedState
lastMapRequestHeard = time.Now()
selfCheckLocked()
}
func SetDERPRegionConnectedState(region int, connected bool) {
mu.Lock()
defer mu.Unlock()
derpRegionConnected[region] = connected
selfCheckLocked()
}
// SetDERPRegionHealth sets or clears any problem associated with the
// provided DERP region.
func SetDERPRegionHealth(region int, problem string) {
mu.Lock()
defer mu.Unlock()
if problem == "" {
delete(derpRegionHealthProblem, region)
} else {
derpRegionHealthProblem[region] = problem
}
selfCheckLocked()
}
func NoteDERPRegionReceivedFrame(region int) {
mu.Lock()
defer mu.Unlock()
derpRegionLastFrame[region] = time.Now()
selfCheckLocked()
}
// state is an ipn.State.String() value: "Running", "Stopped", "NeedsLogin", etc.
func SetIPNState(state string, wantRunning bool) {
mu.Lock()
defer mu.Unlock()
ipnState = state
ipnWantRunning = wantRunning
selfCheckLocked()
}
// SetAnyInterfaceUp sets whether any network interface is up.
func SetAnyInterfaceUp(up bool) {
mu.Lock()
defer mu.Unlock()
anyInterfaceUp = up
selfCheckLocked()
}
// SetUDP4Unbound sets whether the udp4 bind failed completely.
func SetUDP4Unbound(unbound bool) {
mu.Lock()
defer mu.Unlock()
udp4Unbound = unbound
selfCheckLocked()
}
func timerSelfCheck() {
mu.Lock()
defer mu.Unlock()
checkReceiveFuncs()
selfCheckLocked()
if timer != nil {
timer.Reset(time.Minute)
}
}
func selfCheckLocked() {
if ipnState == "" {
// Don't check yet.
return
}
setLocked(SysOverall, overallErrorLocked())
}
// OverallError returns a summary of the health state.
//
// If there are multiple problems, the error will be of type
// multierror.MultipleErrors.
func OverallError() error {
mu.Lock()
defer mu.Unlock()
return overallErrorLocked()
}
var fakeErrForTesting = os.Getenv("TS_DEBUG_FAKE_HEALTH_ERROR")
func overallErrorLocked() error {
if !anyInterfaceUp {
return errors.New("network down")
}
if ipnState != "Running" || !ipnWantRunning {
return fmt.Errorf("state=%v, wantRunning=%v", ipnState, ipnWantRunning)
}
now := time.Now()
if !inMapPoll && (lastMapPollEndedAt.IsZero() || now.Sub(lastMapPollEndedAt) > 10*time.Second) {
return errors.New("not in map poll")
}
const tooIdle = 2*time.Minute + 5*time.Second
if d := now.Sub(lastStreamedMapResponse).Round(time.Second); d > tooIdle {
return fmt.Errorf("no map response in %v", d)
}
rid := derpHomeRegion
if rid == 0 {
return errors.New("no DERP home")
}
if !derpRegionConnected[rid] {
return fmt.Errorf("not connected to home DERP region %v", rid)
}
if d := now.Sub(derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle {
return fmt.Errorf("haven't heard from home DERP region %v in %v", rid, d)
}
if udp4Unbound {
return errors.New("no udp4 bind")
}
// TODO: use
_ = inMapPollSince
_ = lastMapPollEndedAt
_ = lastStreamedMapResponse
_ = lastMapRequestHeard
var errs []error
for _, recv := range receiveFuncs {
if recv.missing {
errs = append(errs, fmt.Errorf("%s is not running", recv.name))
}
}
for sys, err := range sysErr {
if err == nil || sys == SysOverall {
continue
}
errs = append(errs, fmt.Errorf("%v: %w", sys, err))
}
for regionID, problem := range derpRegionHealthProblem {
errs = append(errs, fmt.Errorf("derp%d: %v", regionID, problem))
}
for _, s := range controlHealth {
errs = append(errs, errors.New(s))
}
if e := fakeErrForTesting; len(errs) == 0 && e != "" {
return errors.New(e)
}
sort.Slice(errs, func(i, j int) bool {
// Not super efficient (stringifying these in a sort), but probably max 2 or 3 items.
return errs[i].Error() < errs[j].Error()
})
return multierror.New(errs)
}
var (
ReceiveIPv4 = ReceiveFuncStats{name: "ReceiveIPv4"}
ReceiveIPv6 = ReceiveFuncStats{name: "ReceiveIPv6"}
ReceiveDERP = ReceiveFuncStats{name: "ReceiveDERP"}
receiveFuncs = []*ReceiveFuncStats{&ReceiveIPv4, &ReceiveIPv6, &ReceiveDERP}
)
// ReceiveFuncStats tracks the calls made to a wireguard-go receive func.
type ReceiveFuncStats struct {
// name is the name of the receive func.
name string
// numCalls is the number of times the receive func has ever been called.
// It is required because it is possible for a receive func's wireguard-go goroutine
// to be active even though the receive func isn't.
// The wireguard-go goroutine alternates between calling the receive func and
// processing what the func returned.
numCalls uint64 // accessed atomically
// prevNumCalls is the value of numCalls last time the health check examined it.
prevNumCalls uint64
// inCall indicates whether the receive func is currently running.
inCall uint32 // bool, accessed atomically
// missing indicates whether the receive func is not running.
missing bool
}
func (s *ReceiveFuncStats) Enter() {
atomic.AddUint64(&s.numCalls, 1)
atomic.StoreUint32(&s.inCall, 1)
}
func (s *ReceiveFuncStats) Exit() {
atomic.StoreUint32(&s.inCall, 0)
}
func checkReceiveFuncs() {
for _, recv := range receiveFuncs {
recv.missing = false
prev := recv.prevNumCalls
numCalls := atomic.LoadUint64(&recv.numCalls)
recv.prevNumCalls = numCalls
if numCalls > prev {
// OK: the function has gotten called since last we checked
continue
}
if atomic.LoadUint32(&recv.inCall) == 1 {
// OK: the function is active, probably blocked due to inactivity
continue
}
// Not OK: The function is not active, and not accumulating new calls.
// It is probably MIA.
recv.missing = true
}
}