tailscale/health/health.go
Andrea Gottardo a8ee83e2c5
health: begin work to use structured health warnings instead of strings, pipe changes into ipn.Notify (#12406)
Updates tailscale/tailscale#4136

This PR is the first round of work to move from encoding health warnings as strings and use structured data instead. The current health package revolves around the idea of Subsystems. Each subsystem can have (or not have) a Go error associated with it. The overall health of the backend is given by the concatenation of all these errors.

This PR polishes the concept of Warnable introduced by @bradfitz a few weeks ago. Each Warnable is a component of the backend (for instance, things like 'dns' or 'magicsock' are Warnables). Each Warnable has a unique identifying code. A Warnable is an entity we can warn the user about, by setting (or unsetting) a WarningState for it. Warnables have:

- an identifying Code, so that the GUI can track them as their WarningStates come and go
- a Title, which the GUIs can use to tell the user what component of the backend is broken
- a Text, which is a function that is called with a set of Args to generate a more detailed error message to explain the unhappy state

Additionally, this PR also begins to send Warnables and their WarningStates through LocalAPI to the clients, using ipn.Notify messages. An ipn.Notify is only issued when a warning is added or removed from the Tracker.

In a next PR, we'll get rid of subsystems entirely, and we'll start using structured warnings for all errors affecting the backend functionality.

Signed-off-by: Andrea Gottardo <andrea@gottardo.me>
2024-06-14 11:53:56 -07:00

1074 lines
30 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
// Package health is a registry for other packages to report & check
// overall health status of the node.
package health
import (
"errors"
"fmt"
"maps"
"net/http"
"os"
"runtime"
"sort"
"sync"
"sync/atomic"
"time"
"tailscale.com/envknob"
"tailscale.com/tailcfg"
"tailscale.com/types/opt"
"tailscale.com/util/cibuild"
"tailscale.com/util/mak"
"tailscale.com/util/multierr"
"tailscale.com/util/set"
"tailscale.com/version"
)
var (
mu sync.Mutex
debugHandler map[string]http.Handler
)
// ReceiveFunc is one of the three magicsock Receive funcs (IPv4, IPv6, or
// DERP).
type ReceiveFunc int
// ReceiveFunc indices for Tracker.MagicSockReceiveFuncs.
const (
ReceiveIPv4 ReceiveFunc = 0
ReceiveIPv6 ReceiveFunc = 1
ReceiveDERP ReceiveFunc = 2
)
func (f ReceiveFunc) String() string {
if f < 0 || int(f) >= len(receiveNames) {
return fmt.Sprintf("ReceiveFunc(%d)", f)
}
return receiveNames[f]
}
var receiveNames = []string{
ReceiveIPv4: "ReceiveIPv4",
ReceiveIPv6: "ReceiveIPv6",
ReceiveDERP: "ReceiveDERP",
}
// Tracker tracks the health of various Tailscale subsystems,
// comparing each subsystems' state with each other to make sure
// they're consistent based on the user's intended state.
type Tracker struct {
// MagicSockReceiveFuncs tracks the state of the three
// magicsock receive functions: IPv4, IPv6, and DERP.
MagicSockReceiveFuncs [3]ReceiveFuncStats // indexed by ReceiveFunc values
// mu guards everything that follows.
mu sync.Mutex
warnables []*Warnable // keys ever set
warnableVal map[*Warnable]*warningState
// sysErr maps subsystems to their current error (or nil if the subsystem is healthy)
// Deprecated: using Warnables should be preferred
sysErr map[Subsystem]error
watchers set.HandleSet[func(*Warnable, *UnhealthyState)] // opt func to run if error state changes
timer *time.Timer
latestVersion *tailcfg.ClientVersion // or nil
checkForUpdates bool
inMapPoll bool
inMapPollSince time.Time
lastMapPollEndedAt time.Time
lastStreamedMapResponse time.Time
derpHomeRegion int
derpHomeless bool
derpRegionConnected map[int]bool
derpRegionHealthProblem map[int]string
derpRegionLastFrame map[int]time.Time
lastMapRequestHeard time.Time // time we got a 200 from control for a MapRequest
ipnState string
ipnWantRunning bool
anyInterfaceUp opt.Bool // empty means unknown (assume true)
udp4Unbound bool
controlHealth []string
lastLoginErr error
localLogConfigErr error
tlsConnectionErrors map[string]error // map[ServerName]error
}
// Subsystem is the name of a subsystem whose health can be monitored.
//
// Deprecated: Registering a Warnable using Register() and updating its health state
// with SetUnhealthy() and SetHealthy() should be preferred.
type Subsystem string
const (
// SysRouter is the name of the wgengine/router subsystem.
SysRouter = Subsystem("router")
// SysDNS is the name of the net/dns subsystem.
SysDNS = Subsystem("dns")
// SysDNSOS is the name of the net/dns OSConfigurator subsystem.
SysDNSOS = Subsystem("dns-os")
// SysDNSManager is the name of the net/dns manager subsystem.
SysDNSManager = Subsystem("dns-manager")
// SysTKA is the name of the tailnet key authority subsystem.
SysTKA = Subsystem("tailnet-lock")
)
var subsystemsWarnables = map[Subsystem]*Warnable{}
const legacyErrorArgKey = "LegacyError"
// Warnable() returns a Warnable representing a legacy Subsystem. This is used
// *temporarily* while we migrate the old health infrastructure based on
// Subsystems to the new Warnables architecture.
func (s Subsystem) Warnable() *Warnable {
if w, ok := subsystemsWarnables[s]; ok {
return w
} else {
w := Register(&Warnable{
Code: WarnableCode(s),
Severity: SeverityMedium,
Text: func(args Args) string {
return args[legacyErrorArgKey]
},
})
subsystemsWarnables[s] = w
return w
}
}
var registeredWarnables = map[WarnableCode]*Warnable{}
// Register registers a new Warnable with the health package and returns it.
// Register panics if the Warnable was already registered, because Warnables
// should be unique across the program.
func Register(w *Warnable) *Warnable {
if registeredWarnables[w.Code] != nil {
panic(fmt.Sprintf("health: a Warnable with code %q was already registered", w.Code))
}
mak.Set(&registeredWarnables, w.Code, w)
return w
}
// unregister removes a Warnable from the health package. It should only be used
// for testing purposes.
func unregister(w *Warnable) {
if registeredWarnables[w.Code] == nil {
panic(fmt.Sprintf("health: attempting to unregister Warnable %q that was not registered", w.Code))
}
delete(registeredWarnables, w.Code)
}
// WarnableCode is a string that distinguishes each Warnable from others. It is globally unique within
// the program.
type WarnableCode string
// A Warnable is something that we might want to warn the user about, or not. A Warnable is either
// in an healthy or unhealth state. A Warnable is unhealthy if the Tracker knows about a WarningState
// affecting the Warnable.
// In most cases, Warnables are components of the backend (for instance, "DNS" or "Magicsock").
// Warnables are similar to the Subsystem type previously used in this package, but they provide
// a unique identifying code for each Warnable, along with more metadata that makes it easier for
// a GUI to display the Warnable in a user-friendly way.
type Warnable struct {
// Code is a string that uniquely identifies this Warnable across the entire Tailscale backend,
// and can be mapped to a user-displayable localized string.
Code WarnableCode
// Title is a string that the GUI uses as title for any message involving this Warnable. The title
// should be short and fit in a single line.
Title string
// Text is a function that generates an extended string that the GUI will display to the user when
// this Warnable is in an unhealthy state. The function can use the Args map to provide dynamic
// information to the user.
Text func(args Args) string
// Severity is the severity of the Warnable, which the GUI can use to determine how to display it.
// For instance, a Warnable with SeverityHigh could trigger a modal view, while a Warnable with
// SeverityLow could be displayed in a less intrusive way.
// TODO(angott): turn this into a SeverityFunc, which allows the Warnable to change its severity based on
// the Args of the unhappy state, just like we do in the Text function.
Severity Severity
// DependsOn is a set of Warnables that this Warnable depends, on and need to be healthy
// before this Warnable can also be healthy again. The GUI can use this information to ignore
// this Warnable if one of its dependencies is unhealthy.
DependsOn []*Warnable
// MapDebugFlag is a MapRequest.DebugFlag that is sent to control when this Warnable is unhealthy
//
// Deprecated: this is only used in one case, and will be removed in a future PR
MapDebugFlag string
// If true, this warnable is related to configuration of networking stack
// on the machine that impacts connectivity.
ImpactsConnectivity bool
}
// StaticMessage returns a function that always returns the input string, to be used in
// simple Warnables that do not use the Args map to generate their Text.
func StaticMessage(s string) func(Args) string {
return func(Args) string { return s }
}
// nil reports whether t is nil.
// It exists to accept nil *Tracker receivers on all methods
// to at least not crash. But because a nil receiver indicates
// some lost Tracker plumbing, we want to capture stack trace
// samples when it occurs.
func (t *Tracker) nil() bool {
if t != nil {
return false
}
if cibuild.On() {
stack := make([]byte, 1<<10)
stack = stack[:runtime.Stack(stack, false)]
fmt.Fprintf(os.Stderr, "## WARNING: (non-fatal) nil health.Tracker (being strict in CI):\n%s\n", stack)
}
// TODO(bradfitz): open source our "unexpected" package
// and use it here to capture samples of stacks where
// t is nil.
return true
}
// Severity represents how serious an error is. Each GUI interprets this severity value in different ways,
// to surface the error in a more or less visible way. For instance, the macOS GUI could change its menubar
// icon to display an exclamation mark and present a modal notification for SeverityHigh warnings, but not
// for SeverityLow messages, which would only appear in the Settings window.
type Severity string
const (
SeverityHigh Severity = "high"
SeverityMedium Severity = "medium"
SeverityLow Severity = "low"
)
// Args is a map of Args to string values that can be used to provide parameters regarding
// the unhealthy state of a Warnable.
// For instance, if you have a Warnable to track the health of DNS lookups, here you can include
// the hostname that failed to resolve, or the IP address of the DNS server that has been failing
// to respond. You can then use these parameters in the Text function of the Warnable to provide a detailed
// error message to the user.
type Args map[Arg]string
// A warningState is a condition affecting a Warnable. For each Warnable known to the Tracker, a Warnable
// is in an unhappy state if there is a warningState associated with the Warnable.
type warningState struct {
BrokenSince time.Time // when the Warnable became unhealthy
Args Args // args can be used to provide parameters to the function that generates the Text in the Warnable
}
func (ws *warningState) Equal(other *warningState) bool {
if ws == nil && other == nil {
return true
}
if ws == nil || other == nil {
return false
}
return ws.BrokenSince.Equal(other.BrokenSince) && maps.Equal(ws.Args, other.Args)
}
// SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be
// called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated.
// SetUnhealthy takes ownership of args. The args can be nil if no additional information is
// needed for the unhealthy state.
func (t *Tracker) SetUnhealthy(w *Warnable, args Args) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.setUnhealthyLocked(w, args)
}
func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
if w == nil {
return
}
// If we already have a warningState for this Warnable with an earlier BrokenSince time, keep that
// BrokenSince time.
brokenSince := time.Now()
if existingWS := t.warnableVal[w]; existingWS != nil {
brokenSince = existingWS.BrokenSince
}
if t.warnableVal[w] == nil {
t.warnables = append(t.warnables, w)
}
ws := &warningState{
BrokenSince: brokenSince,
Args: args,
}
prevWs := t.warnableVal[w]
mak.Set(&t.warnableVal, w, ws)
if !ws.Equal(prevWs) {
for _, cb := range t.watchers {
go cb(w, w.unhealthyState(ws))
}
}
}
// SetHealthy removes any warningState for the given Warnable.
func (t *Tracker) SetHealthy(w *Warnable) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.setHealthyLocked(w)
}
func (t *Tracker) setHealthyLocked(w *Warnable) {
if t.warnableVal[w] == nil {
// Nothing to remove
return
}
delete(t.warnableVal, w)
for _, cb := range t.watchers {
go cb(w, nil)
}
}
// AppendWarnableDebugFlags appends to base any health items that are currently in failed
// state and were created with MapDebugFlag.
func (t *Tracker) AppendWarnableDebugFlags(base []string) []string {
if t.nil() {
return base
}
ret := base
t.mu.Lock()
defer t.mu.Unlock()
for w, err := range t.warnableVal {
if w.MapDebugFlag == "" {
continue
}
if err != nil {
ret = append(ret, w.MapDebugFlag)
}
}
sort.Strings(ret[len(base):]) // sort the new ones
return ret
}
// RegisterWatcher adds a function that will be called whenever the health state of any Warnable changes.
// If a Warnable becomes unhealthy or its unhealthy state is updated, the callback will be called with its
// current Representation.
// If a Warnable becomes healthy, the callback will be called with ws set to nil.
// The provided callback function will be executed in its own goroutine. The returned function can be used
// to unregister the callback.
func (t *Tracker) RegisterWatcher(cb func(w *Warnable, r *UnhealthyState)) (unregister func()) {
if t.nil() {
return func() {}
}
t.mu.Lock()
defer t.mu.Unlock()
if t.watchers == nil {
t.watchers = set.HandleSet[func(*Warnable, *UnhealthyState)]{}
}
handle := t.watchers.Add(cb)
if t.timer == nil {
t.timer = time.AfterFunc(time.Minute, t.timerSelfCheck)
}
return func() {
t.mu.Lock()
defer t.mu.Unlock()
delete(t.watchers, handle)
if len(t.watchers) == 0 && t.timer != nil {
t.timer.Stop()
t.timer = nil
}
}
}
// SetRouterHealth sets the state of the wgengine/router.Router.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetRouterHealth(err error) { t.setErr(SysRouter, err) }
// RouterHealth returns the wgengine/router.Router error state.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) RouterHealth() error { return t.get(SysRouter) }
// SetDNSHealth sets the state of the net/dns.Manager
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetDNSHealth(err error) { t.setErr(SysDNS, err) }
// DNSHealth returns the net/dns.Manager error state.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) DNSHealth() error { return t.get(SysDNS) }
// SetDNSOSHealth sets the state of the net/dns.OSConfigurator
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetDNSOSHealth(err error) { t.setErr(SysDNSOS, err) }
// SetDNSManagerHealth sets the state of the Linux net/dns manager's
// discovery of the /etc/resolv.conf situation.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetDNSManagerHealth(err error) { t.setErr(SysDNSManager, err) }
// DNSOSHealth returns the net/dns.OSConfigurator error state.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) DNSOSHealth() error { return t.get(SysDNSOS) }
// SetTKAHealth sets the health of the tailnet key authority.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetTKAHealth(err error) { t.setErr(SysTKA, err) }
// TKAHealth returns the tailnet key authority error state.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) TKAHealth() error { return t.get(SysTKA) }
// SetLocalLogConfigHealth sets the error state of this client's local log configuration.
func (t *Tracker) SetLocalLogConfigHealth(err error) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.localLogConfigErr = err
}
// SetTLSConnectionError sets the error state for connections to a specific
// host. Setting the error to nil will clear any previously-set error.
func (t *Tracker) SetTLSConnectionError(host string, err error) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
if err == nil {
delete(t.tlsConnectionErrors, host)
} else {
mak.Set(&t.tlsConnectionErrors, host, err)
}
}
func RegisterDebugHandler(typ string, h http.Handler) {
mu.Lock()
defer mu.Unlock()
mak.Set(&debugHandler, typ, h)
}
func DebugHandler(typ string) http.Handler {
mu.Lock()
defer mu.Unlock()
return debugHandler[typ]
}
func (t *Tracker) get(key Subsystem) error {
if t.nil() {
return nil
}
t.mu.Lock()
defer t.mu.Unlock()
return t.sysErr[key]
}
func (t *Tracker) setErr(key Subsystem, err error) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.setLocked(key, err)
}
func (t *Tracker) setLocked(key Subsystem, err error) {
if t.sysErr == nil {
t.sysErr = map[Subsystem]error{}
}
old, ok := t.sysErr[key]
if !ok && err == nil {
// Initial happy path.
t.sysErr[key] = nil
t.selfCheckLocked()
return
}
if ok && (old == nil) == (err == nil) {
// No change in overall error status (nil-vs-not), so
// don't run callbacks, but exact error might've
// changed, so note it.
if err != nil {
t.sysErr[key] = err
}
return
}
t.sysErr[key] = err
t.selfCheckLocked()
}
// updateLegacyErrorWarnableLocked takes a legacy Subsystem and an optional error, and
// updates the WarningState for that legacy Subsystem, setting it to healthy or unhealthy.
// It is used temporarily while we migrate from Subsystems to Warnables.
//
// Deprecated: this function will be removed after migrating all subsystem errors to use
// Warnables instead.
func (t *Tracker) updateLegacyErrorWarnableLocked(key Subsystem, err error) {
w := key.Warnable()
if err != nil {
t.setUnhealthyLocked(key.Warnable(), Args{legacyErrorArgKey: err.Error()})
} else {
t.setHealthyLocked(w)
}
}
func (t *Tracker) SetControlHealth(problems []string) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.controlHealth = problems
t.selfCheckLocked()
}
// GotStreamedMapResponse notes that we got a tailcfg.MapResponse
// message in streaming mode, even if it's just a keep-alive message.
//
// This also notes that a map poll is in progress. To unset that, call
// SetOutOfPollNetMap().
func (t *Tracker) GotStreamedMapResponse() {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.lastStreamedMapResponse = time.Now()
if !t.inMapPoll {
t.inMapPoll = true
t.inMapPollSince = time.Now()
}
t.selfCheckLocked()
}
// SetOutOfPollNetMap records that the client is no longer in
// an HTTP map request long poll to the control plane.
func (t *Tracker) SetOutOfPollNetMap() {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
if !t.inMapPoll {
return
}
t.inMapPoll = false
t.lastMapPollEndedAt = time.Now()
t.selfCheckLocked()
}
// GetInPollNetMap reports whether the client has an open
// HTTP long poll open to the control plane.
func (t *Tracker) GetInPollNetMap() bool {
if t.nil() {
return false
}
t.mu.Lock()
defer t.mu.Unlock()
return t.inMapPoll
}
// SetMagicSockDERPHome notes what magicsock's view of its home DERP is.
//
// The homeless parameter is whether magicsock is running in DERP-disconnected
// mode, without discovering and maintaining a connection to its home DERP.
func (t *Tracker) SetMagicSockDERPHome(region int, homeless bool) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.derpHomeRegion = region
t.derpHomeless = homeless
t.selfCheckLocked()
}
// NoteMapRequestHeard notes whenever we successfully sent a map request
// to control for which we received a 200 response.
func (t *Tracker) NoteMapRequestHeard(mr *tailcfg.MapRequest) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
// TODO: extract mr.HostInfo.NetInfo.PreferredDERP, compare
// against SetMagicSockDERPHome and
// SetDERPRegionConnectedState
t.lastMapRequestHeard = time.Now()
t.selfCheckLocked()
}
func (t *Tracker) SetDERPRegionConnectedState(region int, connected bool) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
mak.Set(&t.derpRegionConnected, region, connected)
t.selfCheckLocked()
}
// SetDERPRegionHealth sets or clears any problem associated with the
// provided DERP region.
func (t *Tracker) SetDERPRegionHealth(region int, problem string) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
if problem == "" {
delete(t.derpRegionHealthProblem, region)
} else {
mak.Set(&t.derpRegionHealthProblem, region, problem)
}
t.selfCheckLocked()
}
// NoteDERPRegionReceivedFrame is called to note that a frame was received from
// the given DERP region at the current time.
func (t *Tracker) NoteDERPRegionReceivedFrame(region int) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
mak.Set(&t.derpRegionLastFrame, region, time.Now())
t.selfCheckLocked()
}
// GetDERPRegionReceivedTime returns the last time that a frame was received
// from the given DERP region, or the zero time if no communication with that
// region has occurred.
func (t *Tracker) GetDERPRegionReceivedTime(region int) time.Time {
if t.nil() {
return time.Time{}
}
t.mu.Lock()
defer t.mu.Unlock()
return t.derpRegionLastFrame[region]
}
// state is an ipn.State.String() value: "Running", "Stopped", "NeedsLogin", etc.
func (t *Tracker) SetIPNState(state string, wantRunning bool) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.ipnState = state
t.ipnWantRunning = wantRunning
t.selfCheckLocked()
}
// SetAnyInterfaceUp sets whether any network interface is up.
func (t *Tracker) SetAnyInterfaceUp(up bool) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.anyInterfaceUp.Set(up)
t.selfCheckLocked()
}
// SetUDP4Unbound sets whether the udp4 bind failed completely.
func (t *Tracker) SetUDP4Unbound(unbound bool) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.udp4Unbound = unbound
t.selfCheckLocked()
}
// SetAuthRoutineInError records the latest error encountered as a result of a
// login attempt. Providing a nil error indicates successful login, or that
// being logged in w/coordination is not currently desired.
func (t *Tracker) SetAuthRoutineInError(err error) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
if err == nil && t.lastLoginErr == nil {
return
}
t.lastLoginErr = err
t.selfCheckLocked()
}
// SetLatestVersion records the latest version of the Tailscale client.
// v can be nil if unknown.
func (t *Tracker) SetLatestVersion(v *tailcfg.ClientVersion) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.latestVersion = v
t.selfCheckLocked()
}
// SetCheckForUpdates sets whether the client wants to check for updates.
func (t *Tracker) SetCheckForUpdates(v bool) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
if t.checkForUpdates == v {
return
}
t.checkForUpdates = v
t.selfCheckLocked()
}
func (t *Tracker) timerSelfCheck() {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.checkReceiveFuncsLocked()
t.selfCheckLocked()
if t.timer != nil {
t.timer.Reset(time.Minute)
}
}
func (t *Tracker) selfCheckLocked() {
if t.ipnState == "" {
// Don't check yet.
return
}
t.updateBuiltinWarnablesLocked()
}
// OverallError returns a summary of the health state.
//
// If there are multiple problems, the error will be of type
// multierr.Error.
func (t *Tracker) OverallError() error {
if t.nil() {
return nil
}
t.mu.Lock()
defer t.mu.Unlock()
t.updateBuiltinWarnablesLocked()
return t.multiErrLocked()
}
// Strings() returns a string array containing the Text of all Warnings
// currently known to the Tracker. These strings can be presented to the
// user, although ideally you would use the Code property on each Warning
// to show a localized version of them instead.
// This function is here for legacy compatibility purposes and is deprecated.
func (t *Tracker) Strings() []string {
if t.nil() {
return nil
}
t.mu.Lock()
defer t.mu.Unlock()
return t.stringsLocked()
}
func (t *Tracker) stringsLocked() []string {
result := []string{}
for w, ws := range t.warnableVal {
if ws.Args == nil {
result = append(result, w.Text(Args{}))
} else {
result = append(result, w.Text(ws.Args))
}
}
return result
}
// errorsLocked returns an array of errors where each error is the Text
// of a Warning known to the Tracker.
// This function is here for legacy compatibility purposes and is deprecated.
func (t *Tracker) errorsLocked() []error {
strs := t.stringsLocked()
errs := []error{}
for _, str := range strs {
errs = append(errs, errors.New(str))
}
return errs
}
// multiErrLocked returns an error listing all errors known to the Tracker.
// This function is here for legacy compatibility purposes and is deprecated.
func (t *Tracker) multiErrLocked() error {
errs := t.errorsLocked()
return multierr.New(errs...)
}
var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR")
// updateBuiltinWarnablesLocked performs a number of checks on the state of the backend,
// and adds/removes Warnings from the Tracker as needed.
func (t *Tracker) updateBuiltinWarnablesLocked() {
if t.checkForUpdates {
if cv := t.latestVersion; cv != nil && !cv.RunningLatest && cv.LatestVersion != "" {
if cv.UrgentSecurityUpdate {
t.setUnhealthyLocked(securityUpdateAvailableWarnable, Args{
ArgCurrentVersion: version.Short(),
ArgAvailableVersion: cv.LatestVersion,
})
} else {
t.setUnhealthyLocked(updateAvailableWarnable, Args{
ArgCurrentVersion: version.Short(),
ArgAvailableVersion: cv.LatestVersion,
})
}
}
}
if version.IsUnstableBuild() {
t.setUnhealthyLocked(unstableWarnable, Args{
ArgCurrentVersion: version.Short(),
})
}
if v, ok := t.anyInterfaceUp.Get(); ok && !v {
t.setUnhealthyLocked(NetworkStatusWarnable, nil)
return
} else {
t.setHealthyLocked(NetworkStatusWarnable)
}
if t.localLogConfigErr != nil {
t.setUnhealthyLocked(localLogWarnable, Args{
ArgError: t.localLogConfigErr.Error(),
})
return
} else {
t.setHealthyLocked(localLogWarnable)
}
if !t.ipnWantRunning {
t.setUnhealthyLocked(IPNStateWarnable, Args{
"State": t.ipnState,
})
return
} else {
t.setHealthyLocked(IPNStateWarnable)
}
if t.lastLoginErr != nil {
t.setUnhealthyLocked(LoginStateWarnable, Args{
ArgError: t.lastLoginErr.Error(),
})
return
} else {
t.setHealthyLocked(LoginStateWarnable)
}
now := time.Now()
if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) {
t.setUnhealthyLocked(notInMapPollWarnable, nil)
return
} else {
t.setHealthyLocked(notInMapPollWarnable)
}
const tooIdle = 2*time.Minute + 5*time.Second
if d := now.Sub(t.lastStreamedMapResponse).Round(time.Second); d > tooIdle {
t.setUnhealthyLocked(mapResponseTimeoutWarnable, Args{
ArgDuration: d.String(),
})
return
} else {
t.setHealthyLocked(mapResponseTimeoutWarnable)
}
if !t.derpHomeless {
rid := t.derpHomeRegion
if rid == 0 {
t.setUnhealthyLocked(noDERPHomeWarnable, nil)
return
} else if !t.derpRegionConnected[rid] {
t.setUnhealthyLocked(noDERPConnectionWarnable, Args{
ArgRegionID: fmt.Sprint(rid),
})
return
} else if d := now.Sub(t.derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle {
t.setUnhealthyLocked(derpTimeoutWarnable, Args{
ArgRegionID: fmt.Sprint(rid),
ArgDuration: d.String(),
})
return
}
}
t.setHealthyLocked(noDERPHomeWarnable)
t.setHealthyLocked(noDERPConnectionWarnable)
t.setHealthyLocked(derpTimeoutWarnable)
if t.udp4Unbound {
t.setUnhealthyLocked(noUDP4BindWarnable, nil)
return
} else {
t.setHealthyLocked(noUDP4BindWarnable)
}
// TODO: use
_ = t.inMapPollSince
_ = t.lastMapPollEndedAt
_ = t.lastStreamedMapResponse
_ = t.lastMapRequestHeard
shouldClearMagicsockWarnings := false
for i := range t.MagicSockReceiveFuncs {
f := &t.MagicSockReceiveFuncs[i]
if f.missing {
t.setUnhealthyLocked(magicsockReceiveFuncWarnable, Args{
ArgMagicsockFunctionName: f.name,
})
shouldClearMagicsockWarnings = false
}
}
if shouldClearMagicsockWarnings {
t.setHealthyLocked(magicsockReceiveFuncWarnable)
}
// Iterates over the legacy subsystems and their error, and turns them into structured errors
for sys, err := range t.sysErr {
t.updateLegacyErrorWarnableLocked(sys, err)
}
if len(t.derpRegionHealthProblem) > 0 {
for regionID, problem := range t.derpRegionHealthProblem {
t.setUnhealthyLocked(derpRegionErrorWarnable, Args{
ArgRegionID: fmt.Sprint(regionID),
ArgError: problem,
})
}
} else {
t.setHealthyLocked(derpRegionErrorWarnable)
}
if len(t.controlHealth) > 0 {
for _, s := range t.controlHealth {
t.setUnhealthyLocked(controlHealthWarnable, Args{
ArgError: s,
})
}
} else {
t.setHealthyLocked(controlHealthWarnable)
}
if err := envknob.ApplyDiskConfigError(); err != nil {
t.setUnhealthyLocked(applyDiskConfigWarnable, Args{
ArgError: err.Error(),
})
} else {
t.setHealthyLocked(applyDiskConfigWarnable)
}
if len(t.tlsConnectionErrors) > 0 {
for serverName, err := range t.tlsConnectionErrors {
t.setUnhealthyLocked(tlsConnectionFailedWarnable, Args{
ArgServerName: serverName,
ArgError: err.Error(),
})
}
} else {
t.setHealthyLocked(tlsConnectionFailedWarnable)
}
if e := fakeErrForTesting(); len(t.warnables) == 0 && e != "" {
t.setUnhealthyLocked(testWarnable, Args{
ArgError: e,
})
} else {
t.setHealthyLocked(testWarnable)
}
}
// ReceiveFuncStats tracks the calls made to a wireguard-go receive func.
type ReceiveFuncStats struct {
// name is the name of the receive func.
// It's lazily populated.
name string
// numCalls is the number of times the receive func has ever been called.
// It is required because it is possible for a receive func's wireguard-go goroutine
// to be active even though the receive func isn't.
// The wireguard-go goroutine alternates between calling the receive func and
// processing what the func returned.
numCalls atomic.Uint64
// prevNumCalls is the value of numCalls last time the health check examined it.
prevNumCalls uint64
// inCall indicates whether the receive func is currently running.
inCall atomic.Bool
// missing indicates whether the receive func is not running.
missing bool
}
func (s *ReceiveFuncStats) Enter() {
s.numCalls.Add(1)
s.inCall.Store(true)
}
func (s *ReceiveFuncStats) Exit() {
s.inCall.Store(false)
}
// ReceiveFuncStats returns the ReceiveFuncStats tracker for the given func
// type.
//
// If t is nil, it returns nil.
func (t *Tracker) ReceiveFuncStats(which ReceiveFunc) *ReceiveFuncStats {
if t == nil {
return nil
}
return &t.MagicSockReceiveFuncs[which]
}
func (t *Tracker) checkReceiveFuncsLocked() {
for i := range t.MagicSockReceiveFuncs {
f := &t.MagicSockReceiveFuncs[i]
if f.name == "" {
f.name = (ReceiveFunc(i)).String()
}
if runtime.GOOS == "js" && i < 2 {
// Skip IPv4 and IPv6 on js.
continue
}
f.missing = false
prev := f.prevNumCalls
numCalls := f.numCalls.Load()
f.prevNumCalls = numCalls
if numCalls > prev {
// OK: the function has gotten called since last we checked
continue
}
if f.inCall.Load() {
// OK: the function is active, probably blocked due to inactivity
continue
}
// Not OK: The function is not active, and not accumulating new calls.
// It is probably MIA.
f.missing = true
}
}