2023-01-27 13:37:20 -08:00
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
2021-02-18 08:58:13 -08:00
// Package health is a registry for other packages to report & check
// overall health status of the node.
package health
import (
2021-03-15 22:20:48 -07:00
"errors"
"fmt"
2021-12-21 13:52:50 -08:00
"net/http"
2024-04-26 08:06:06 -07:00
"os"
2021-10-22 09:12:00 -07:00
"runtime"
2021-03-15 22:20:48 -07:00
"sort"
2021-02-18 08:58:13 -08:00
"sync"
2021-04-26 17:08:05 -07:00
"sync/atomic"
2021-02-24 21:29:51 -08:00
"time"
2022-01-24 10:52:57 -08:00
"tailscale.com/envknob"
2021-02-24 21:29:51 -08:00
"tailscale.com/tailcfg"
2024-04-25 13:24:49 -07:00
"tailscale.com/types/opt"
2024-04-26 08:06:06 -07:00
"tailscale.com/util/cibuild"
2024-04-25 13:24:49 -07:00
"tailscale.com/util/mak"
2021-11-02 14:30:48 -07:00
"tailscale.com/util/multierr"
2022-11-28 10:34:35 -08:00
"tailscale.com/util/set"
2024-05-01 13:54:56 -07:00
"tailscale.com/version"
2021-02-18 08:58:13 -08:00
)
// mu guards debugHandler.
var mu sync.Mutex

// debugHandler maps a handler type name to the HTTP handler registered
// for it via RegisterDebugHandler. It is nil until the first registration.
var debugHandler map[string]http.Handler
2024-04-26 17:24:04 -07:00
// ReceiveFunc identifies one of the three magicsock Receive funcs (IPv4,
// IPv6, or DERP).
type ReceiveFunc int

// ReceiveFunc indices for Tracker.MagicSockReceiveFuncs.
const (
	ReceiveIPv4 ReceiveFunc = iota
	ReceiveIPv6
	ReceiveDERP
)

// String returns the symbolic name of f, or "ReceiveFunc(N)" when f is not
// one of the known values.
func (f ReceiveFunc) String() string {
	if idx := int(f); idx >= 0 && idx < len(receiveNames) {
		return receiveNames[idx]
	}
	return fmt.Sprintf("ReceiveFunc(%d)", f)
}

// receiveNames holds the name of each known ReceiveFunc, indexed by its value.
var receiveNames = []string{
	ReceiveIPv4: "ReceiveIPv4",
	ReceiveIPv6: "ReceiveIPv6",
	ReceiveDERP: "ReceiveDERP",
}
2024-04-26 10:12:46 -07:00
// Tracker tracks the health of various Tailscale subsystems,
// comparing each subsystems' state with each other to make sure
// they're consistent based on the user's intended state.
2024-04-25 13:24:49 -07:00
type Tracker struct {
2024-04-26 17:24:04 -07:00
// MagicSockReceiveFuncs tracks the state of the three
// magicsock receive functions: IPv4, IPv6, and DERP.
MagicSockReceiveFuncs [ 3 ] ReceiveFuncStats // indexed by ReceiveFunc values
// mu guards everything that follows.
2021-02-24 21:29:51 -08:00
mu sync . Mutex
2024-04-25 14:25:48 -07:00
warnables [ ] * Warnable // keys ever set
warnableVal map [ * Warnable ] error
sysErr map [ Subsystem ] error // subsystem => err (or nil for no error)
watchers set . HandleSet [ func ( Subsystem , error ) ] // opt func to run if error state changes
timer * time . Timer
2021-02-24 21:29:51 -08:00
2024-05-01 13:54:56 -07:00
latestVersion * tailcfg . ClientVersion // or nil
checkForUpdates bool
2021-02-24 21:29:51 -08:00
inMapPoll bool
inMapPollSince time . Time
lastMapPollEndedAt time . Time
lastStreamedMapResponse time . Time
derpHomeRegion int
2023-11-16 13:43:36 -08:00
derpHomeless bool
2024-04-25 13:24:49 -07:00
derpRegionConnected map [ int ] bool
derpRegionHealthProblem map [ int ] string
derpRegionLastFrame map [ int ] time . Time
2021-02-24 21:29:51 -08:00
lastMapRequestHeard time . Time // time we got a 200 from control for a MapRequest
ipnState string
ipnWantRunning bool
2024-04-25 13:24:49 -07:00
anyInterfaceUp opt . Bool // empty means unknown (assume true)
2021-04-28 10:36:54 -07:00
udp4Unbound bool
2021-09-18 12:59:55 -07:00
controlHealth [ ] string
2022-06-03 10:52:07 -07:00
lastLoginErr error
2022-10-18 14:54:07 -06:00
localLogConfigErr error
2024-04-25 13:24:49 -07:00
tlsConnectionErrors map [ string ] error // map[ServerName]error
}
2021-02-18 08:58:13 -08:00
2021-03-15 22:20:48 -07:00
// Subsystem is the name of a subsystem whose health can be monitored.
type Subsystem string

// Names of the subsystems tracked by this package.
const (
	// SysOverall is the name representing the overall health of
	// the system, rather than one particular subsystem.
	SysOverall Subsystem = "overall"

	// SysRouter is the name of the wgengine/router subsystem.
	SysRouter Subsystem = "router"

	// SysDNS is the name of the net/dns subsystem.
	SysDNS Subsystem = "dns"

	// SysDNSOS is the name of the net/dns OSConfigurator subsystem.
	SysDNSOS Subsystem = "dns-os"

	// SysDNSManager is the name of the net/dns manager subsystem.
	SysDNSManager Subsystem = "dns-manager"

	// SysTKA is the name of the tailnet key authority subsystem.
	SysTKA Subsystem = "tailnet-lock"
)
2022-02-15 06:59:15 -08:00
2024-04-25 14:25:48 -07:00
// NewWarnable returns a new warnable item that the caller can mark as healthy
// or in warning state via Tracker.SetWarnable.
//
// NewWarnable is generally called in init and stored in a package global. It
// can be used by multiple Trackers.
func NewWarnable(opts ...WarnableOpt) *Warnable {
	w := &Warnable{}
	for i := range opts {
		opts[i].mod(w)
	}
	return w
}

// WarnableOpt is an option passed to NewWarnable.
type WarnableOpt interface {
	// mod applies the option to the Warnable under construction.
	mod(*Warnable)
}

// WithMapDebugFlag returns a WarnableOpt for NewWarnable that makes the returned
// Warnable report itself to the coordination server as broken with this
// string in MapRequest.DebugFlag when Set to a non-nil value.
func WithMapDebugFlag(name string) WarnableOpt {
	var setFlag warnOptFunc = func(w *Warnable) {
		w.debugFlag = name
	}
	return setFlag
}

// WithConnectivityImpact returns an option which makes a Warnable annotated as
// something that could be breaking external network connectivity on the
// machine. This will make the warnable returned by OverallError alongside
// network connectivity errors.
func WithConnectivityImpact() WarnableOpt {
	var markImpact warnOptFunc = func(w *Warnable) {
		w.hasConnectivityImpact = true
	}
	return markImpact
}

// warnOptFunc adapts a plain func to the WarnableOpt interface.
type warnOptFunc func(*Warnable)

// mod implements WarnableOpt by calling f on w.
func (f warnOptFunc) mod(w *Warnable) {
	f(w)
}

// Warnable is a health check item that may or may not be in a bad warning state.
// The caller of NewWarnable is responsible for calling Tracker.SetWarnable to update the state.
type Warnable struct {
	// debugFlag is the optional MapRequest.DebugFlag to send when unhealthy.
	debugFlag string

	// hasConnectivityImpact, if true, marks this warning as related to
	// configuration of the networking stack on the machine that impacts
	// connectivity.
	hasConnectivityImpact bool
}
2024-04-25 20:26:49 -07:00
// nil reports whether t is nil.
// It exists to accept nil *Tracker receivers on all methods
// to at least not crash. But because a nil receiver indicates
// some lost Tracker plumbing, we want to capture stack trace
// samples when it occurs.
func ( t * Tracker ) nil ( ) bool {
if t != nil {
return false
}
2024-04-26 08:06:06 -07:00
if cibuild . On ( ) {
stack := make ( [ ] byte , 1 << 10 )
stack = stack [ : runtime . Stack ( stack , false ) ]
fmt . Fprintf ( os . Stderr , "## WARNING: (non-fatal) nil health.Tracker (being strict in CI):\n%s\n" , stack )
}
2024-04-25 20:26:49 -07:00
// TODO(bradfitz): open source our "unexpected" package
// and use it here to capture samples of stacks where
// t is nil.
return true
}
2022-11-13 07:32:37 -08:00
// Set updates the Warnable's state.
// If non-nil, it's considered unhealthy.
2024-04-25 14:25:48 -07:00
func ( t * Tracker ) SetWarnable ( w * Warnable , err error ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 14:25:48 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
l0 := len ( t . warnableVal )
mak . Set ( & t . warnableVal , w , err )
if len ( t . warnableVal ) != l0 {
t . warnables = append ( t . warnables , w )
2022-11-13 07:32:37 -08:00
}
}
// AppendWarnableDebugFlags appends to base any health items that are currently in failed
// state and were created with MapDebugFlag.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) AppendWarnableDebugFlags ( base [ ] string ) [ ] string {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return base
}
2022-11-13 07:32:37 -08:00
ret := base
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
2024-04-25 14:25:48 -07:00
for w , err := range t . warnableVal {
2022-11-13 07:32:37 -08:00
if w . debugFlag == "" {
continue
}
2024-04-25 14:25:48 -07:00
if err != nil {
2022-11-13 07:32:37 -08:00
ret = append ( ret , w . debugFlag )
}
}
sort . Strings ( ret [ len ( base ) : ] ) // sort the new ones
return ret
}
2021-03-15 22:20:48 -07:00
2021-02-18 08:58:13 -08:00
// RegisterWatcher adds a function that will be called if an
// error changes state either to unhealthy or from unhealthy. It is
// not called on transition from unknown to healthy. It must be non-nil
// and is run in its own goroutine. The returned func unregisters it.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) RegisterWatcher ( cb func ( key Subsystem , err error ) ) ( unregister func ( ) ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return func ( ) { }
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
if t . watchers == nil {
t . watchers = set . HandleSet [ func ( Subsystem , error ) ] { }
}
handle := t . watchers . Add ( cb )
if t . timer == nil {
t . timer = time . AfterFunc ( time . Minute , t . timerSelfCheck )
2021-03-15 22:20:48 -07:00
}
2021-02-18 08:58:13 -08:00
return func ( ) {
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
delete ( t . watchers , handle )
if len ( t . watchers ) == 0 && t . timer != nil {
t . timer . Stop ( )
t . timer = nil
2021-03-15 22:20:48 -07:00
}
2021-02-18 08:58:13 -08:00
}
}
2021-04-02 19:31:58 -07:00
// SetRouterHealth sets the state of the wgengine/router.Router.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetRouterHealth ( err error ) { t . setErr ( SysRouter , err ) }
2021-02-18 08:58:13 -08:00
// RouterHealth returns the wgengine/router.Router error state.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) RouterHealth ( ) error { return t . get ( SysRouter ) }
2021-02-18 08:58:13 -08:00
2021-04-02 19:31:58 -07:00
// SetDNSHealth sets the state of the net/dns.Manager
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetDNSHealth ( err error ) { t . setErr ( SysDNS , err ) }
2021-04-02 19:31:58 -07:00
// DNSHealth returns the net/dns.Manager error state.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) DNSHealth ( ) error { return t . get ( SysDNS ) }
2021-04-02 19:31:58 -07:00
2021-11-18 15:52:21 -08:00
// SetDNSOSHealth sets the state of the net/dns.OSConfigurator
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetDNSOSHealth ( err error ) { t . setErr ( SysDNSOS , err ) }
2021-11-18 15:52:21 -08:00
2022-02-15 06:59:15 -08:00
// SetDNSManagerHealth sets the state of the Linux net/dns manager's
// discovery of the /etc/resolv.conf situation.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetDNSManagerHealth ( err error ) { t . setErr ( SysDNSManager , err ) }
2022-02-15 06:59:15 -08:00
2021-11-18 15:52:21 -08:00
// DNSOSHealth returns the net/dns.OSConfigurator error state.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) DNSOSHealth ( ) error { return t . get ( SysDNSOS ) }
2021-11-18 15:52:21 -08:00
2023-01-04 10:36:07 -08:00
// SetTKAHealth sets the health of the tailnet key authority.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetTKAHealth ( err error ) { t . setErr ( SysTKA , err ) }
2023-01-04 10:36:07 -08:00
// TKAHealth returns the tailnet key authority error state.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) TKAHealth ( ) error { return t . get ( SysTKA ) }
2023-01-04 10:36:07 -08:00
2022-10-18 14:54:07 -06:00
// SetLocalLogConfigHealth sets the error state of this client's local log configuration.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetLocalLogConfigHealth ( err error ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . localLogConfigErr = err
2022-10-18 14:54:07 -06:00
}
2023-02-01 14:29:44 -05:00
// SetTLSConnectionError sets the error state for connections to a specific
// host. Setting the error to nil will clear any previously-set error.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetTLSConnectionError ( host string , err error ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
2023-02-01 14:29:44 -05:00
if err == nil {
2024-04-25 13:24:49 -07:00
delete ( t . tlsConnectionErrors , host )
2023-02-01 14:29:44 -05:00
} else {
2024-04-25 13:24:49 -07:00
mak . Set ( & t . tlsConnectionErrors , host , err )
2023-02-01 14:29:44 -05:00
}
}
2021-12-21 13:52:50 -08:00
func RegisterDebugHandler ( typ string , h http . Handler ) {
mu . Lock ( )
defer mu . Unlock ( )
2024-04-25 13:24:49 -07:00
mak . Set ( & debugHandler , typ , h )
2021-12-21 13:52:50 -08:00
}
// DebugHandler returns the debug HTTP handler registered for typ via
// RegisterDebugHandler, or nil if there is none.
func DebugHandler(typ string) http.Handler {
	mu.Lock()
	defer mu.Unlock()

	h := debugHandler[typ]
	return h
}
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) get ( key Subsystem ) error {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return nil
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
return t . sysErr [ key ]
2021-02-18 08:58:13 -08:00
}
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) setErr ( key Subsystem , err error ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . setLocked ( key , err )
2021-03-15 22:20:48 -07:00
}
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) setLocked ( key Subsystem , err error ) {
if t . sysErr == nil {
t . sysErr = map [ Subsystem ] error { }
}
old , ok := t . sysErr [ key ]
2021-02-18 08:58:13 -08:00
if ! ok && err == nil {
// Initial happy path.
2024-04-25 13:24:49 -07:00
t . sysErr [ key ] = nil
t . selfCheckLocked ( )
2021-02-18 08:58:13 -08:00
return
}
if ok && ( old == nil ) == ( err == nil ) {
// No change in overall error status (nil-vs-not), so
// don't run callbacks, but exact error might've
// changed, so note it.
if err != nil {
2024-04-25 13:24:49 -07:00
t . sysErr [ key ] = err
2021-02-18 08:58:13 -08:00
}
return
}
2024-04-25 13:24:49 -07:00
t . sysErr [ key ] = err
t . selfCheckLocked ( )
for _ , cb := range t . watchers {
2021-02-18 08:58:13 -08:00
go cb ( key , err )
}
}
2021-02-24 21:29:51 -08:00
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetControlHealth ( problems [ ] string ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . controlHealth = problems
t . selfCheckLocked ( )
2021-09-18 12:59:55 -07:00
}
2021-02-24 21:29:51 -08:00
// GotStreamedMapResponse notes that we got a tailcfg.MapResponse
// message in streaming mode, even if it's just a keep-alive message.
2023-08-30 08:57:55 -07:00
//
// This also notes that a map poll is in progress. To unset that, call
// SetOutOfPollNetMap().
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) GotStreamedMapResponse ( ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . lastStreamedMapResponse = time . Now ( )
if ! t . inMapPoll {
t . inMapPoll = true
t . inMapPollSince = time . Now ( )
2023-08-30 08:57:55 -07:00
}
2024-04-25 13:24:49 -07:00
t . selfCheckLocked ( )
2021-02-24 21:29:51 -08:00
}
2023-08-30 08:57:55 -07:00
// SetOutOfPollNetMap records that the client is no longer in
// an HTTP map request long poll to the control plane.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetOutOfPollNetMap ( ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
if ! t . inMapPoll {
2021-02-24 21:29:51 -08:00
return
}
2024-04-25 13:24:49 -07:00
t . inMapPoll = false
t . lastMapPollEndedAt = time . Now ( )
t . selfCheckLocked ( )
2021-02-24 21:29:51 -08:00
}
2021-12-16 08:06:32 -08:00
// GetInPollNetMap reports whether the client has an open
// HTTP long poll open to the control plane.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) GetInPollNetMap ( ) bool {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return false
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
return t . inMapPoll
2021-12-16 08:06:32 -08:00
}
2021-02-24 21:29:51 -08:00
// SetMagicSockDERPHome notes what magicsock's view of its home DERP is.
2023-11-16 13:43:36 -08:00
//
// The homeless parameter is whether magicsock is running in DERP-disconnected
// mode, without discovering and maintaining a connection to its home DERP.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetMagicSockDERPHome ( region int , homeless bool ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . derpHomeRegion = region
t . derpHomeless = homeless
t . selfCheckLocked ( )
2021-02-24 21:29:51 -08:00
}
// NoteMapRequestHeard notes whenever we successfully sent a map request
// to control for which we received a 200 response.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) NoteMapRequestHeard ( mr * tailcfg . MapRequest ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
2021-02-24 21:29:51 -08:00
// TODO: extract mr.HostInfo.NetInfo.PreferredDERP, compare
// against SetMagicSockDERPHome and
// SetDERPRegionConnectedState
2024-04-25 13:24:49 -07:00
t . lastMapRequestHeard = time . Now ( )
t . selfCheckLocked ( )
2021-02-24 21:29:51 -08:00
}
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetDERPRegionConnectedState ( region int , connected bool ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
mak . Set ( & t . derpRegionConnected , region , connected )
t . selfCheckLocked ( )
2021-02-24 21:29:51 -08:00
}
2021-09-01 19:27:22 -07:00
// SetDERPRegionHealth sets or clears any problem associated with the
// provided DERP region.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetDERPRegionHealth ( region int , problem string ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
2021-09-01 19:27:22 -07:00
if problem == "" {
2024-04-25 13:24:49 -07:00
delete ( t . derpRegionHealthProblem , region )
2021-09-01 19:27:22 -07:00
} else {
2024-04-25 13:24:49 -07:00
mak . Set ( & t . derpRegionHealthProblem , region , problem )
2021-09-01 19:27:22 -07:00
}
2024-04-25 13:24:49 -07:00
t . selfCheckLocked ( )
2021-09-01 19:27:22 -07:00
}
2023-12-08 15:03:15 -05:00
// NoteDERPRegionReceivedFrame is called to note that a frame was received from
// the given DERP region at the current time.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) NoteDERPRegionReceivedFrame ( region int ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
mak . Set ( & t . derpRegionLastFrame , region , time . Now ( ) )
t . selfCheckLocked ( )
2021-02-24 21:29:51 -08:00
}
2023-12-08 15:03:15 -05:00
// GetDERPRegionReceivedTime returns the last time that a frame was received
// from the given DERP region, or the zero time if no communication with that
// region has occurred.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) GetDERPRegionReceivedTime ( region int ) time . Time {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return time . Time { }
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
return t . derpRegionLastFrame [ region ]
2023-12-08 15:03:15 -05:00
}
2021-02-24 21:29:51 -08:00
// state is an ipn.State.String() value: "Running", "Stopped", "NeedsLogin", etc.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetIPNState ( state string , wantRunning bool ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . ipnState = state
t . ipnWantRunning = wantRunning
t . selfCheckLocked ( )
2021-02-24 21:29:51 -08:00
}
2021-03-22 21:41:53 -07:00
// SetAnyInterfaceUp sets whether any network interface is up.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetAnyInterfaceUp ( up bool ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . anyInterfaceUp . Set ( up )
t . selfCheckLocked ( )
2021-03-22 21:41:53 -07:00
}
2021-04-28 10:36:54 -07:00
// SetUDP4Unbound sets whether the udp4 bind failed completely.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetUDP4Unbound ( unbound bool ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . udp4Unbound = unbound
t . selfCheckLocked ( )
2021-04-28 10:36:54 -07:00
}
2022-06-03 10:52:07 -07:00
// SetAuthRoutineInError records the latest error encountered as a result of a
// login attempt. Providing a nil error indicates successful login, or that
// being logged in w/coordination is not currently desired.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) SetAuthRoutineInError ( err error ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
2024-05-01 13:54:56 -07:00
if err == nil && t . lastLoginErr == nil {
return
}
2024-04-25 13:24:49 -07:00
t . lastLoginErr = err
2024-05-01 13:54:56 -07:00
t . selfCheckLocked ( )
}
// SetLatestVersion records the latest version of the Tailscale client
// and re-runs the self-check. v can be nil if unknown.
func (t *Tracker) SetLatestVersion(v *tailcfg.ClientVersion) {
	if t.nil() {
		return
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	t.latestVersion = v
	t.selfCheckLocked()
}
// SetCheckForUpdates sets whether the client wants to check for updates.
func ( t * Tracker ) SetCheckForUpdates ( v bool ) {
if t . nil ( ) {
return
}
t . mu . Lock ( )
defer t . mu . Unlock ( )
if t . checkForUpdates == v {
return
}
t . checkForUpdates = v
t . selfCheckLocked ( )
2022-06-03 10:52:07 -07:00
}
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) timerSelfCheck ( ) {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
2024-04-26 17:24:04 -07:00
t . checkReceiveFuncsLocked ( )
2024-04-25 13:24:49 -07:00
t . selfCheckLocked ( )
if t . timer != nil {
t . timer . Reset ( time . Minute )
2021-03-15 22:20:48 -07:00
}
}
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) selfCheckLocked ( ) {
if t . ipnState == "" {
2021-03-15 22:20:48 -07:00
// Don't check yet.
return
}
2024-04-25 13:24:49 -07:00
t . setLocked ( SysOverall , t . overallErrorLocked ( ) )
2021-03-15 22:20:48 -07:00
}
2024-05-01 13:54:56 -07:00
// AppendWarnings appends all current health warnings to dst and returns the
// result. A multierr.Error from OverallError contributes one entry per
// contained error; any other error contributes a single entry.
func (t *Tracker) AppendWarnings(dst []string) []string {
	err := t.OverallError()
	switch e := err.(type) {
	case nil:
		return dst
	case multierr.Error:
		for _, sub := range e.Errors() {
			dst = append(dst, sub.Error())
		}
	default:
		dst = append(dst, err.Error())
	}
	return dst
}
2021-09-01 19:27:22 -07:00
// OverallError returns a summary of the health state.
//
// If there are multiple problems, the error will be of type
2021-11-02 14:30:48 -07:00
// multierr.Error.
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) OverallError ( ) error {
2024-04-25 20:26:49 -07:00
if t . nil ( ) {
return nil
}
2024-04-25 13:24:49 -07:00
t . mu . Lock ( )
defer t . mu . Unlock ( )
return t . overallErrorLocked ( )
2021-09-01 19:27:22 -07:00
}
2022-09-14 12:49:39 -07:00
// fakeErrForTesting is the envknob for TS_DEBUG_FAKE_HEALTH_ERROR. When it
// yields a non-empty string and no other health errors were found,
// overallErrorLocked reports that string as the overall error.
var (
	fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR")
)
2021-09-16 11:24:44 -07:00
2024-04-25 14:25:48 -07:00
// networkErrorfLocked creates an error that indicates issues with outgoing network
2024-01-03 00:23:58 +00:00
// connectivity. Any active warnings related to network connectivity will
// automatically be appended to it.
2024-04-25 14:25:48 -07:00
//
// t.mu must be held.
func ( t * Tracker ) networkErrorfLocked ( format string , a ... any ) error {
2024-01-03 00:23:58 +00:00
errs := [ ] error {
fmt . Errorf ( format , a ... ) ,
}
2024-04-25 14:25:48 -07:00
for _ , w := range t . warnables {
2024-01-03 00:23:58 +00:00
if ! w . hasConnectivityImpact {
continue
}
2024-04-25 14:25:48 -07:00
if err := t . warnableVal [ w ] ; err != nil {
2024-01-03 00:23:58 +00:00
errs = append ( errs , err )
}
}
if len ( errs ) == 1 {
return errs [ 0 ]
}
return multierr . New ( errs ... )
}
2024-04-25 13:24:49 -07:00
// Fixed errors reported by overallErrorLocked.
var (
	errNetworkDown  = errors.New("network down")
	errNotInMapPoll = errors.New("not in map poll")
	errNoDERPHome   = errors.New("no DERP home")
	errNoUDP4Bind   = errors.New("no udp4 bind")
	errUnstable     = errors.New("This is an unstable (development) version of Tailscale; frequent updates and bugs are likely")
)
2024-01-03 00:23:58 +00:00
2024-04-25 13:24:49 -07:00
func ( t * Tracker ) overallErrorLocked ( ) error {
2024-05-01 13:54:56 -07:00
var errs [ ] error
add := func ( err error ) {
if err != nil {
errs = append ( errs , err )
}
}
merged := func ( ) error {
return multierr . New ( errs ... )
}
if t . checkForUpdates {
if cv := t . latestVersion ; cv != nil && ! cv . RunningLatest && cv . LatestVersion != "" {
if cv . UrgentSecurityUpdate {
add ( fmt . Errorf ( "Security update available: %v -> %v, run `tailscale update` or `tailscale set --auto-update` to update" , version . Short ( ) , cv . LatestVersion ) )
} else {
add ( fmt . Errorf ( "Update available: %v -> %v, run `tailscale update` or `tailscale set --auto-update` to update" , version . Short ( ) , cv . LatestVersion ) )
}
}
}
if version . IsUnstableBuild ( ) {
add ( errUnstable )
}
2024-04-25 13:24:49 -07:00
if v , ok := t . anyInterfaceUp . Get ( ) ; ok && ! v {
2024-05-01 13:54:56 -07:00
add ( errNetworkDown )
return merged ( )
2021-03-22 21:41:53 -07:00
}
2024-04-25 13:24:49 -07:00
if t . localLogConfigErr != nil {
2024-05-01 13:54:56 -07:00
add ( t . localLogConfigErr )
return merged ( )
2022-10-18 14:54:07 -06:00
}
2024-04-25 13:24:49 -07:00
if ! t . ipnWantRunning {
2024-05-01 13:54:56 -07:00
add ( fmt . Errorf ( "state=%v, wantRunning=%v" , t . ipnState , t . ipnWantRunning ) )
return merged ( )
2021-03-15 22:20:48 -07:00
}
2024-04-25 13:24:49 -07:00
if t . lastLoginErr != nil {
2024-05-01 13:54:56 -07:00
add ( fmt . Errorf ( "not logged in, last login error=%v" , t . lastLoginErr ) )
return merged ( )
2022-06-03 10:52:07 -07:00
}
2021-03-15 22:20:48 -07:00
now := time . Now ( )
2024-04-25 13:24:49 -07:00
if ! t . inMapPoll && ( t . lastMapPollEndedAt . IsZero ( ) || now . Sub ( t . lastMapPollEndedAt ) > 10 * time . Second ) {
2024-05-01 13:54:56 -07:00
add ( errNotInMapPoll )
return merged ( )
2021-03-15 22:20:48 -07:00
}
const tooIdle = 2 * time . Minute + 5 * time . Second
2024-04-25 13:24:49 -07:00
if d := now . Sub ( t . lastStreamedMapResponse ) . Round ( time . Second ) ; d > tooIdle {
2024-05-01 13:54:56 -07:00
add ( t . networkErrorfLocked ( "no map response in %v" , d ) )
return merged ( )
2021-03-15 22:20:48 -07:00
}
2024-04-25 13:24:49 -07:00
if ! t . derpHomeless {
rid := t . derpHomeRegion
2023-11-16 13:43:36 -08:00
if rid == 0 {
2024-05-01 13:54:56 -07:00
add ( errNoDERPHome )
return merged ( )
2023-11-16 13:43:36 -08:00
}
2024-04-25 13:24:49 -07:00
if ! t . derpRegionConnected [ rid ] {
2024-05-01 13:54:56 -07:00
add ( t . networkErrorfLocked ( "not connected to home DERP region %v" , rid ) )
return merged ( )
2023-11-16 13:43:36 -08:00
}
2024-04-25 13:24:49 -07:00
if d := now . Sub ( t . derpRegionLastFrame [ rid ] ) . Round ( time . Second ) ; d > tooIdle {
2024-05-01 13:54:56 -07:00
add ( t . networkErrorfLocked ( "haven't heard from home DERP region %v in %v" , rid , d ) )
return merged ( )
2023-11-16 13:43:36 -08:00
}
2021-03-15 22:20:48 -07:00
}
2024-04-25 13:24:49 -07:00
if t . udp4Unbound {
2024-05-01 13:54:56 -07:00
add ( errNoUDP4Bind )
return merged ( )
2021-04-28 10:36:54 -07:00
}
2021-03-15 22:20:48 -07:00
// TODO: use
2024-04-25 13:24:49 -07:00
_ = t . inMapPollSince
_ = t . lastMapPollEndedAt
_ = t . lastStreamedMapResponse
_ = t . lastMapRequestHeard
2021-03-15 22:20:48 -07:00
2024-04-26 17:24:04 -07:00
for i := range t . MagicSockReceiveFuncs {
f := & t . MagicSockReceiveFuncs [ i ]
if f . missing {
errs = append ( errs , fmt . Errorf ( "%s is not running" , f . name ) )
2021-04-26 17:08:05 -07:00
}
}
2024-04-25 13:24:49 -07:00
for sys , err := range t . sysErr {
2021-03-15 22:20:48 -07:00
if err == nil || sys == SysOverall {
continue
}
errs = append ( errs , fmt . Errorf ( "%v: %w" , sys , err ) )
}
2024-04-25 14:25:48 -07:00
for _ , w := range t . warnables {
if err := t . warnableVal [ w ] ; err != nil {
2022-11-13 07:32:37 -08:00
errs = append ( errs , err )
}
}
2024-04-25 13:24:49 -07:00
for regionID , problem := range t . derpRegionHealthProblem {
2021-09-01 19:27:22 -07:00
errs = append ( errs , fmt . Errorf ( "derp%d: %v" , regionID , problem ) )
}
2024-04-25 13:24:49 -07:00
for _ , s := range t . controlHealth {
2021-09-18 12:59:55 -07:00
errs = append ( errs , errors . New ( s ) )
}
2022-09-16 20:24:28 -07:00
if err := envknob . ApplyDiskConfigError ( ) ; err != nil {
errs = append ( errs , err )
}
2024-04-25 13:24:49 -07:00
for serverName , err := range t . tlsConnectionErrors {
2023-02-01 14:29:44 -05:00
errs = append ( errs , fmt . Errorf ( "TLS connection error for %q: %w" , serverName , err ) )
}
2022-09-14 12:49:39 -07:00
if e := fakeErrForTesting ( ) ; len ( errs ) == 0 && e != "" {
2021-09-16 11:24:44 -07:00
return errors . New ( e )
}
2021-03-15 22:20:48 -07:00
sort . Slice ( errs , func ( i , j int ) bool {
// Not super efficient (stringifying these in a sort), but probably max 2 or 3 items.
return errs [ i ] . Error ( ) < errs [ j ] . Error ( )
} )
2021-11-02 14:30:48 -07:00
return multierr . New ( errs ... )
2021-02-24 21:29:51 -08:00
}
2021-04-26 17:08:05 -07:00
// ReceiveFuncStats tracks the calls made to a wireguard-go receive func.
type ReceiveFuncStats struct {
	// name is the name of the receive func.
	// It's lazily populated.
	name string
	// numCalls is the number of times the receive func has ever been called.
	// It is required because it is possible for a receive func's wireguard-go goroutine
	// to be active even though the receive func isn't.
	// The wireguard-go goroutine alternates between calling the receive func and
	// processing what the func returned.
	numCalls atomic.Uint64
	// prevNumCalls is the value of numCalls last time the health check examined it.
	prevNumCalls uint64
	// inCall indicates whether the receive func is currently running.
	inCall atomic.Bool
	// missing indicates whether the receive func is not running.
	missing bool
}

// Enter records that the receive func has been called and is now running.
//
// It is a no-op on a nil receiver: Tracker.ReceiveFuncStats returns nil for
// a nil Tracker, and callers should be able to use that result without
// crashing, like the other nil-tolerant methods in this package.
func (s *ReceiveFuncStats) Enter() {
	if s == nil {
		return
	}
	s.numCalls.Add(1)
	s.inCall.Store(true)
}

// Exit records that the receive func is no longer running.
//
// It is a no-op on a nil receiver, for the same reason as Enter.
func (s *ReceiveFuncStats) Exit() {
	if s == nil {
		return
	}
	s.inCall.Store(false)
}
// ReceiveFuncStats returns the ReceiveFuncStats tracker for the given func
// type.
//
// If t is nil, it returns nil.
func ( t * Tracker ) ReceiveFuncStats ( which ReceiveFunc ) * ReceiveFuncStats {
if t == nil {
return nil
}
return & t . MagicSockReceiveFuncs [ which ]
2021-04-26 17:08:05 -07:00
}
2024-04-26 17:24:04 -07:00
func ( t * Tracker ) checkReceiveFuncsLocked ( ) {
for i := range t . MagicSockReceiveFuncs {
f := & t . MagicSockReceiveFuncs [ i ]
if f . name == "" {
f . name = ( ReceiveFunc ( i ) ) . String ( )
}
if runtime . GOOS == "js" && i < 2 {
// Skip IPv4 and IPv6 on js.
continue
}
f . missing = false
prev := f . prevNumCalls
numCalls := f . numCalls . Load ( )
f . prevNumCalls = numCalls
2021-04-26 17:08:05 -07:00
if numCalls > prev {
// OK: the function has gotten called since last we checked
continue
}
2024-04-26 17:24:04 -07:00
if f . inCall . Load ( ) {
2021-04-26 17:08:05 -07:00
// OK: the function is active, probably blocked due to inactivity
continue
}
// Not OK: The function is not active, and not accumulating new calls.
// It is probably MIA.
2024-04-26 17:24:04 -07:00
f . missing = true
2021-04-26 17:08:05 -07:00
}
}