From 9eb65601ef3a1f5e4627173340e1157bc1d64e8c Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Mon, 15 Mar 2021 22:20:48 -0700 Subject: [PATCH] health, ipn/ipnlocal: track, log overall health Updates #1505 Signed-off-by: Brad Fitzpatrick --- control/controlclient/auto.go | 7 +- health/health.go | 119 ++++++++++++++++++++++++++++------ ipn/ipnlocal/local.go | 40 ++++++++---- 3 files changed, 131 insertions(+), 35 deletions(-) diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index 191378568..dfc8df7a4 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -179,8 +179,11 @@ func NewNoStart(opts Options) (*Client, error) { } -func (c *Client) onHealthChange(key string, err error) { - c.logf("controlclient: restarting map request for %q health change to new state: %v", key, err) +func (c *Client) onHealthChange(sys health.Subsystem, err error) { + if sys == health.SysOverall { + return + } + c.logf("controlclient: restarting map request for %q health change to new state: %v", sys, err) c.cancelMapSafely() } diff --git a/health/health.go b/health/health.go index a8253f19c..81d346d15 100644 --- a/health/health.go +++ b/health/health.go @@ -7,9 +7,13 @@ package health import ( + "errors" + "fmt" + "sort" "sync" "time" + "github.com/go-multierror/multierror" "tailscale.com/tailcfg" ) @@ -17,8 +21,9 @@ // mu guards everything in this var block. mu sync.Mutex - m = map[string]error{} // error key => err (or nil for no error) - watchers = map[*watchHandle]func(string, error){} // opt func to run if error state changes + sysErr = map[Subsystem]error{} // error key => err (or nil for no error) + watchers = map[*watchHandle]func(Subsystem, error){} // opt func to run if error state changes + timer *time.Timer inMapPoll bool inMapPollSince time.Time @@ -32,49 +37,77 @@ ipnWantRunning bool ) +// Subsystem is the name of a subsystem whose health can be monitored. +type Subsystem string + +const ( + // SysOverall is the name representing the overall health of + // the system, rather than one particular subsystem. + SysOverall = Subsystem("overall") + + // SysRouter is the name the wgengine/router subsystem. + SysRouter = Subsystem("router") + + // SysNetworkCategory is the name of the subsystem that sets + // the Windows network adapter's "category" (public, private, domain). + // If it's unhealthy, the Windows firewall rules won't match. + SysNetworkCategory = Subsystem("network-category") +) + type watchHandle byte // RegisterWatcher adds a function that will be called if an // error changes state either to unhealthy or from unhealthy. It is // not called on transition from unknown to healthy. It must be non-nil // and is run in its own goroutine. The returned func unregisters it. -func RegisterWatcher(cb func(errKey string, err error)) (unregister func()) { +func RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) { mu.Lock() defer mu.Unlock() handle := new(watchHandle) watchers[handle] = cb + if timer == nil { + timer = time.AfterFunc(time.Minute, timerSelfCheck) + } return func() { mu.Lock() defer mu.Unlock() delete(watchers, handle) + if len(watchers) == 0 && timer != nil { + timer.Stop() + timer = nil + } } } // SetRouter sets the state of the wgengine/router.Router. -func SetRouterHealth(err error) { set("router", err) } +func SetRouterHealth(err error) { set(SysRouter, err) } // RouterHealth returns the wgengine/router.Router error state. -func RouterHealth() error { return get("router") } +func RouterHealth() error { return get(SysRouter) } // SetNetworkCategoryHealth sets the state of setting the network adaptor's category. // This only applies on Windows. -func SetNetworkCategoryHealth(err error) { set("network-category", err) } +func SetNetworkCategoryHealth(err error) { set(SysNetworkCategory, err) } -func NetworkCategoryHealth() error { return get("network-category") } +func NetworkCategoryHealth() error { return get(SysNetworkCategory) } -func get(key string) error { +func get(key Subsystem) error { mu.Lock() defer mu.Unlock() - return m[key] + return sysErr[key] } -func set(key string, err error) { +func set(key Subsystem, err error) { mu.Lock() defer mu.Unlock() - old, ok := m[key] + setLocked(key, err) +} + +func setLocked(key Subsystem, err error) { + old, ok := sysErr[key] if !ok && err == nil { // Initial happy path. - m[key] = nil + sysErr[key] = nil selfCheckLocked() return } @@ -83,11 +116,11 @@ func set(key string, err error) { // don't run callbacks, but exact error might've // changed, so note it. if err != nil { - m[key] = err + sysErr[key] = err } return } - m[key] = err + sysErr[key] = err selfCheckLocked() for _, cb := range watchers { go cb(key, err) @@ -162,14 +195,62 @@ func SetIPNState(state string, wantRunning bool) { selfCheckLocked() } +func timerSelfCheck() { + mu.Lock() + defer mu.Unlock() + selfCheckLocked() + if timer != nil { + timer.Reset(time.Minute) + } +} + func selfCheckLocked() { - // TODO: check states against each other. - // For staticcheck for now: + if ipnState == "" { + // Don't check yet. + return + } + setLocked(SysOverall, overallErrorLocked()) +} + +func overallErrorLocked() error { + if ipnState != "Running" || !ipnWantRunning { + return fmt.Errorf("state=%v, wantRunning=%v", ipnState, ipnWantRunning) + } + now := time.Now() + if !inMapPoll && (lastMapPollEndedAt.IsZero() || now.Sub(lastMapPollEndedAt) > 10*time.Second) { + return errors.New("not in map poll") + } + const tooIdle = 2*time.Minute + 5*time.Second + if d := now.Sub(lastStreamedMapResponse).Round(time.Second); d > tooIdle { + return fmt.Errorf("no map response in %v", d) + } + rid := derpHomeRegion + if rid == 0 { + return errors.New("no DERP home") + } + if !derpRegionConnected[rid] { + return fmt.Errorf("not connected to home DERP region %v", rid) + } + if d := now.Sub(derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle { + return fmt.Errorf("haven't heard from home DERP region %v in %v", rid, d) + } + + // TODO: use _ = inMapPollSince _ = lastMapPollEndedAt _ = lastStreamedMapResponse - _ = derpHomeRegion _ = lastMapRequestHeard - _ = ipnState - _ = ipnWantRunning + + var errs []error + for sys, err := range sysErr { + if err == nil || sys == SysOverall { + continue + } + errs = append(errs, fmt.Errorf("%v: %w", sys, err)) + } + sort.Slice(errs, func(i, j int) bool { + // Not super efficient (stringifying these in a sort), but probably max 2 or 3 items. + return errs[i].Error() < errs[j].Error() + }) + return multierror.New(errs) } diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 444594271..38c9323d7 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -65,20 +65,21 @@ func getControlDebugFlags() []string { // state machine generates events back out to zero or more components. type LocalBackend struct { // Elements that are thread-safe or constant after construction. - ctx context.Context // canceled by Close - ctxCancel context.CancelFunc // cancels ctx - logf logger.Logf // general logging - keyLogf logger.Logf // for printing list of peers on change - statsLogf logger.Logf // for printing peers stats on change - e wgengine.Engine - store ipn.StateStore - backendLogID string - unregisterLinkMon func() - portpoll *portlist.Poller // may be nil - portpollOnce sync.Once // guards starting readPoller - gotPortPollRes chan struct{} // closed upon first readPoller result - serverURL string // tailcontrol URL - newDecompressor func() (controlclient.Decompressor, error) + ctx context.Context // canceled by Close + ctxCancel context.CancelFunc // cancels ctx + logf logger.Logf // general logging + keyLogf logger.Logf // for printing list of peers on change + statsLogf logger.Logf // for printing peers stats on change + e wgengine.Engine + store ipn.StateStore + backendLogID string + unregisterLinkMon func() + unregisterHealthWatch func() + portpoll *portlist.Poller // may be nil + portpollOnce sync.Once // guards starting readPoller + gotPortPollRes chan struct{} // closed upon first readPoller result + serverURL string // tailcontrol URL + newDecompressor func() (controlclient.Decompressor, error) filterHash string @@ -148,6 +149,8 @@ func NewLocalBackend(logf logger.Logf, logid string, store ipn.StateStore, e wge b.linkChange(false, linkMon.InterfaceState()) b.unregisterLinkMon = linkMon.RegisterChangeCallback(b.linkChange) + b.unregisterHealthWatch = health.RegisterWatcher(b.onHealthChange) + return b, nil } @@ -182,6 +185,14 @@ func (b *LocalBackend) linkChange(major bool, ifst *interfaces.State) { b.updateFilter(b.netMap, b.prefs) } +func (b *LocalBackend) onHealthChange(sys health.Subsystem, err error) { + if err == nil { + b.logf("health(%q): ok", sys) + } else { + b.logf("health(%q): error: %v", sys, err) + } +} + // Shutdown halts the backend and all its sub-components. The backend // can no longer be used after Shutdown returns. func (b *LocalBackend) Shutdown() { @@ -190,6 +201,7 @@ func (b *LocalBackend) Shutdown() { b.mu.Unlock() b.unregisterLinkMon() + b.unregisterHealthWatch() if cli != nil { cli.Shutdown() }