tailscale/health/health.go
Josh Bleecher Snyder 8d7f7fc7ce health, wgenegine: fix receive func health checks yet again
The existing implementation was completely, embarrassingly conceptually broken.

We aren't able to see whether wireguard-go's receive function goroutines
are running or not. All we can do is model that based on what we have done.
This commit fixes that model.

Fixes #1781

Signed-off-by: Josh Bleecher Snyder <josharian@gmail.com>
2021-04-23 08:42:04 -07:00

349 lines
9.6 KiB
Go

// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package health is a registry for other packages to report & check
// overall health status of the node.
package health
import (
"errors"
"fmt"
"sort"
"sync"
"time"
"github.com/go-multierror/multierror"
"tailscale.com/tailcfg"
)
var (
// mu guards everything in this var block.
mu sync.Mutex
sysErr = map[Subsystem]error{} // error key => err (or nil for no error)
watchers = map[*watchHandle]func(Subsystem, error){} // opt func to run if error state changes
timer *time.Timer
inMapPoll bool
inMapPollSince time.Time
lastMapPollEndedAt time.Time
lastStreamedMapResponse time.Time
derpHomeRegion int
derpRegionConnected = map[int]bool{}
derpRegionLastFrame = map[int]time.Time{}
lastMapRequestHeard time.Time // time we got a 200 from control for a MapRequest
ipnState string
ipnWantRunning bool
anyInterfaceUp = true // until told otherwise
ReceiveIPv4 = ReceiveFuncState{name: "IPv4"}
ReceiveIPv6 = ReceiveFuncState{name: "IPv6"}
ReceiveDERP = ReceiveFuncState{name: "DERP"}
)
// Subsystem is the name of a subsystem whose health can be monitored.
type Subsystem string
const (
// SysOverall is the name representing the overall health of
// the system, rather than one particular subsystem.
SysOverall = Subsystem("overall")
// SysRouter is the name the wgengine/router subsystem.
SysRouter = Subsystem("router")
// SysDNS is the name of the net/dns subsystem.
SysDNS = Subsystem("dns")
// SysNetworkCategory is the name of the subsystem that sets
// the Windows network adapter's "category" (public, private, domain).
// If it's unhealthy, the Windows firewall rules won't match.
SysNetworkCategory = Subsystem("network-category")
)
type watchHandle byte
// RegisterWatcher adds a function that will be called if an
// error changes state either to unhealthy or from unhealthy. It is
// not called on transition from unknown to healthy. It must be non-nil
// and is run in its own goroutine. The returned func unregisters it.
func RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) {
mu.Lock()
defer mu.Unlock()
handle := new(watchHandle)
watchers[handle] = cb
if timer == nil {
timer = time.AfterFunc(time.Minute, timerSelfCheck)
}
return func() {
mu.Lock()
defer mu.Unlock()
delete(watchers, handle)
if len(watchers) == 0 && timer != nil {
timer.Stop()
timer = nil
}
}
}
// SetRouterHealth sets the state of the wgengine/router.Router.
func SetRouterHealth(err error) { set(SysRouter, err) }
// RouterHealth returns the wgengine/router.Router error state.
func RouterHealth() error { return get(SysRouter) }
// SetDNSHealth sets the state of the net/dns.Manager
func SetDNSHealth(err error) { set(SysDNS, err) }
// DNSHealth returns the net/dns.Manager error state.
func DNSHealth() error { return get(SysDNS) }
// SetNetworkCategoryHealth sets the state of setting the network adaptor's category.
// This only applies on Windows.
func SetNetworkCategoryHealth(err error) { set(SysNetworkCategory, err) }
func NetworkCategoryHealth() error { return get(SysNetworkCategory) }
func get(key Subsystem) error {
mu.Lock()
defer mu.Unlock()
return sysErr[key]
}
func set(key Subsystem, err error) {
mu.Lock()
defer mu.Unlock()
setLocked(key, err)
}
func setLocked(key Subsystem, err error) {
old, ok := sysErr[key]
if !ok && err == nil {
// Initial happy path.
sysErr[key] = nil
selfCheckLocked()
return
}
if ok && (old == nil) == (err == nil) {
// No change in overall error status (nil-vs-not), so
// don't run callbacks, but exact error might've
// changed, so note it.
if err != nil {
sysErr[key] = err
}
return
}
sysErr[key] = err
selfCheckLocked()
for _, cb := range watchers {
go cb(key, err)
}
}
// GotStreamedMapResponse notes that we got a tailcfg.MapResponse
// message in streaming mode, even if it's just a keep-alive message.
func GotStreamedMapResponse() {
mu.Lock()
defer mu.Unlock()
lastStreamedMapResponse = time.Now()
selfCheckLocked()
}
// SetInPollNetMap records that we're in
func SetInPollNetMap(v bool) {
mu.Lock()
defer mu.Unlock()
if v == inMapPoll {
return
}
inMapPoll = v
if v {
inMapPollSince = time.Now()
} else {
lastMapPollEndedAt = time.Now()
}
}
// SetMagicSockDERPHome notes what magicsock's view of its home DERP is.
func SetMagicSockDERPHome(region int) {
mu.Lock()
defer mu.Unlock()
derpHomeRegion = region
selfCheckLocked()
}
// NoteMapRequestHeard notes whenever we successfully sent a map request
// to control for which we received a 200 response.
func NoteMapRequestHeard(mr *tailcfg.MapRequest) {
mu.Lock()
defer mu.Unlock()
// TODO: extract mr.HostInfo.NetInfo.PreferredDERP, compare
// against SetMagicSockDERPHome and
// SetDERPRegionConnectedState
lastMapRequestHeard = time.Now()
selfCheckLocked()
}
func SetDERPRegionConnectedState(region int, connected bool) {
mu.Lock()
defer mu.Unlock()
derpRegionConnected[region] = connected
selfCheckLocked()
}
func NoteDERPRegionReceivedFrame(region int) {
mu.Lock()
defer mu.Unlock()
derpRegionLastFrame[region] = time.Now()
selfCheckLocked()
}
// state is an ipn.State.String() value: "Running", "Stopped", "NeedsLogin", etc.
func SetIPNState(state string, wantRunning bool) {
mu.Lock()
defer mu.Unlock()
ipnState = state
ipnWantRunning = wantRunning
selfCheckLocked()
}
// SetAnyInterfaceUp sets whether any network interface is up.
func SetAnyInterfaceUp(up bool) {
mu.Lock()
defer mu.Unlock()
anyInterfaceUp = up
selfCheckLocked()
}
// ReceiveFuncState tracks the state of a wireguard-go conn.ReceiveFunc.
type ReceiveFuncState struct {
// name is a mnemonic for the receive func, used in error messages.
name string
// started indicates whether magicsock.connBind.Open
// has requested that wireguard-go start its receive func
// goroutine (without a corresponding connBind.Close).
started bool
// running models whether wireguard-go's receive func
// goroutine is actually running. We cannot easily introspect that,
// so it is based on our knowledge of wireguard-go's internals.
running bool
}
// err returns the error state (if any) that s represents.
func (s ReceiveFuncState) err() error {
// Possible states:
// | started | running | notes
// | ------- | ------- | -----
// | true | true | normal operation
// | true | false | we prematurely returned a permanent error from this receive func
// | false | true | we have told package health that we're closing the bind, but the receive funcs haven't closed yet (transient)
// | false | false | not running
// The problematic case is started && !running.
// If that happens, wireguard-go will no longer request packets,
// and we'll lose an entire communication channel.
if s.started && !s.running {
return fmt.Errorf("receive%s started but not running", s.name)
}
return nil
}
// Open tells r that connBind.Open has requested wireguard-go open a conn.Bind that includes r.
func (r *ReceiveFuncState) Open() {
mu.Lock()
defer mu.Unlock()
r.started = true
r.running = true
selfCheckLocked()
}
// Stop tells r that we have returned a permanent error to wireguard-go.
// wireguard-go's receive func goroutine for r will soon stop.
func (r *ReceiveFuncState) Stop() {
mu.Lock()
defer mu.Unlock()
r.running = false
selfCheckLocked()
}
// Close tells r that connBind.Close has requested wireguard-go close the bind for r.
// This will stop the corresponding receive func goroutine.
// Close must be called before actually closing the underlying connection,
// to avoid a small window of false positives.
func (r *ReceiveFuncState) Close() {
mu.Lock()
defer mu.Unlock()
r.started = false
selfCheckLocked()
}
func timerSelfCheck() {
mu.Lock()
defer mu.Unlock()
selfCheckLocked()
if timer != nil {
timer.Reset(time.Minute)
}
}
func selfCheckLocked() {
if ipnState == "" {
// Don't check yet.
return
}
setLocked(SysOverall, overallErrorLocked())
}
func overallErrorLocked() error {
if !anyInterfaceUp {
return errors.New("network down")
}
if ipnState != "Running" || !ipnWantRunning {
return fmt.Errorf("state=%v, wantRunning=%v", ipnState, ipnWantRunning)
}
now := time.Now()
if !inMapPoll && (lastMapPollEndedAt.IsZero() || now.Sub(lastMapPollEndedAt) > 10*time.Second) {
return errors.New("not in map poll")
}
const tooIdle = 2*time.Minute + 5*time.Second
if d := now.Sub(lastStreamedMapResponse).Round(time.Second); d > tooIdle {
return fmt.Errorf("no map response in %v", d)
}
rid := derpHomeRegion
if rid == 0 {
return errors.New("no DERP home")
}
if !derpRegionConnected[rid] {
return fmt.Errorf("not connected to home DERP region %v", rid)
}
if d := now.Sub(derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle {
return fmt.Errorf("haven't heard from home DERP region %v in %v", rid, d)
}
// TODO: use
_ = inMapPollSince
_ = lastMapPollEndedAt
_ = lastStreamedMapResponse
_ = lastMapRequestHeard
var errs []error
for _, recv := range []ReceiveFuncState{ReceiveIPv4, ReceiveIPv6, ReceiveDERP} {
if err := recv.err(); err != nil {
errs = append(errs, err)
}
}
for sys, err := range sysErr {
if err == nil || sys == SysOverall {
continue
}
errs = append(errs, fmt.Errorf("%v: %w", sys, err))
}
sort.Slice(errs, func(i, j int) bool {
// Not super efficient (stringifying these in a sort), but probably max 2 or 3 items.
return errs[i].Error() < errs[j].Error()
})
return multierror.New(errs)
}