control/controlclient,health,tailcfg: refactor control health messages (#15839)

* control/controlclient,health,tailcfg: refactor control health messages

Updates tailscale/corp#27759

Signed-off-by: James Sanderson <jsanderson@tailscale.com>
Signed-off-by: Paul Scott <408401+icio@users.noreply.github.com>
Co-authored-by: Paul Scott <408401+icio@users.noreply.github.com>
This commit is contained in:
James 'zofrex' Sanderson
2025-05-22 13:40:32 +01:00
committed by GitHub
parent 980ab4244d
commit aa8bc23c49
12 changed files with 495 additions and 122 deletions

View File

@@ -88,34 +88,35 @@ type Tracker struct {
// sysErr maps subsystems to their current error (or nil if the subsystem is healthy)
// Deprecated: using Warnables should be preferred
sysErr map[Subsystem]error
watchers set.HandleSet[func(*Warnable, *UnhealthyState)] // opt func to run if error state changes
watchers set.HandleSet[func(Change)] // opt func to run if error state changes
timer tstime.TimerController
latestVersion *tailcfg.ClientVersion // or nil
checkForUpdates bool
applyUpdates opt.Bool
inMapPoll bool
inMapPollSince time.Time
lastMapPollEndedAt time.Time
lastStreamedMapResponse time.Time
lastNoiseDial time.Time
derpHomeRegion int
derpHomeless bool
derpRegionConnected map[int]bool
derpRegionHealthProblem map[int]string
derpRegionLastFrame map[int]time.Time
derpMap *tailcfg.DERPMap // last DERP map from control, could be nil if never received one
lastMapRequestHeard time.Time // time we got a 200 from control for a MapRequest
ipnState string
ipnWantRunning bool
ipnWantRunningLastTrue time.Time // when ipnWantRunning last changed false -> true
anyInterfaceUp opt.Bool // empty means unknown (assume true)
controlHealth []string
lastLoginErr error
localLogConfigErr error
tlsConnectionErrors map[string]error // map[ServerName]error
metricHealthMessage *metrics.MultiLabelMap[metricHealthMessageLabel]
inMapPoll bool
inMapPollSince time.Time
lastMapPollEndedAt time.Time
lastStreamedMapResponse time.Time
lastNoiseDial time.Time
derpHomeRegion int
derpHomeless bool
derpRegionConnected map[int]bool
derpRegionHealthProblem map[int]string
derpRegionLastFrame map[int]time.Time
derpMap *tailcfg.DERPMap // last DERP map from control, could be nil if never received one
lastMapRequestHeard time.Time // time we got a 200 from control for a MapRequest
ipnState string
ipnWantRunning bool
ipnWantRunningLastTrue time.Time // when ipnWantRunning last changed false -> true
anyInterfaceUp opt.Bool // empty means unknown (assume true)
lastNotifiedControlMessages map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage // latest control messages processed, kept for change detection
controlMessages map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage // latest control messages received
lastLoginErr error
localLogConfigErr error
tlsConnectionErrors map[string]error // map[ServerName]error
metricHealthMessage *metrics.MultiLabelMap[metricHealthMessageLabel]
}
func (t *Tracker) now() time.Time {
@@ -207,13 +208,15 @@ func unregister(w *Warnable) {
// the program.
type WarnableCode string
// A Warnable is something that we might want to warn the user about, or not. A Warnable is either
// in an healthy or unhealth state. A Warnable is unhealthy if the Tracker knows about a WarningState
// affecting the Warnable.
// In most cases, Warnables are components of the backend (for instance, "DNS" or "Magicsock").
// Warnables are similar to the Subsystem type previously used in this package, but they provide
// a unique identifying code for each Warnable, along with more metadata that makes it easier for
// a GUI to display the Warnable in a user-friendly way.
// A Warnable is something that we might want to warn the user about, or not. A
// Warnable is either in a healthy or unhealthy state. A Warnable is unhealthy if
// the Tracker knows about a WarningState affecting the Warnable.
//
// In most cases, Warnables are components of the backend (for instance, "DNS"
// or "Magicsock"). Warnables are similar to the Subsystem type previously used
// in this package, but they provide a unique identifying code for each
// Warnable, along with more metadata that makes it easier for a GUI to display
// the Warnable in a user-friendly way.
type Warnable struct {
// Code is a string that uniquely identifies this Warnable across the entire Tailscale backend,
// and can be mapped to a user-displayable localized string.
@@ -409,12 +412,18 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
prevWs := t.warnableVal[w]
mak.Set(&t.warnableVal, w, ws)
if !ws.Equal(prevWs) {
change := Change{
WarnableChanged: true,
Warnable: w,
UnhealthyState: w.unhealthyState(ws),
}
for _, cb := range t.watchers {
// If the Warnable has been unhealthy for more than its TimeToVisible, the callback should be
// executed immediately. Otherwise, the callback should be enqueued to run once the Warnable
// becomes visible.
if w.IsVisible(ws, t.now) {
cb(w, w.unhealthyState(ws))
cb(change)
continue
}
@@ -427,7 +436,7 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
// Check if the Warnable is still unhealthy, as it could have become healthy between the time
// the timer was set for and the time it was executed.
if t.warnableVal[w] != nil {
cb(w, w.unhealthyState(ws))
cb(change)
delete(t.pendingVisibleTimers, w)
}
})
@@ -460,8 +469,23 @@ func (t *Tracker) setHealthyLocked(w *Warnable) {
delete(t.pendingVisibleTimers, w)
}
change := Change{
WarnableChanged: true,
Warnable: w,
}
for _, cb := range t.watchers {
cb(w, nil)
cb(change)
}
}
// notifyWatchersControlChangedLocked tells every registered watcher that the
// set of control-plane health messages has changed. The Change it delivers
// carries no message payload; recipients are expected to fetch the current
// messages via CurrentState.
//
// Each watcher callback is invoked synchronously, in turn, from this method.
// NOTE(review): the "Locked" suffix suggests the caller must hold t.mu; confirm
// against the call sites.
func (t *Tracker) notifyWatchersControlChangedLocked() {
	for _, notify := range t.watchers {
		notify(Change{ControlHealthChanged: true})
	}
}
@@ -488,23 +512,57 @@ func (t *Tracker) AppendWarnableDebugFlags(base []string) []string {
return ret
}
// RegisterWatcher adds a function that will be called whenever the health state of any Warnable changes.
// If a Warnable becomes unhealthy or its unhealthy state is updated, the callback will be called with its
// current Representation.
// If a Warnable becomes healthy, the callback will be called with ws set to nil.
// The provided callback function will be executed in its own goroutine. The returned function can be used
// to unregister the callback.
func (t *Tracker) RegisterWatcher(cb func(w *Warnable, r *UnhealthyState)) (unregister func()) {
return t.registerSyncWatcher(func(w *Warnable, r *UnhealthyState) {
go cb(w, r)
// Change is used to communicate a change to health. This could either be due to
// a Warnable changing from healthy to unhealthy (or vice-versa), or because the
// health messages received from the control-plane have changed.
//
// Exactly one *Changed field will be true.
type Change struct {
// ControlHealthChanged indicates it was health messages from the
// control-plane server that changed. When it is set, the Warnable and
// UnhealthyState fields are left at their zero values.
ControlHealthChanged bool
// WarnableChanged indicates it was a client Warnable which changed state.
WarnableChanged bool
// Warnable is the Warnable whose health changed. Its new state is
// described by UnhealthyState.
Warnable *Warnable
// UnhealthyState is set if the changed Warnable is now unhealthy, or nil
// if Warnable is now healthy.
UnhealthyState *UnhealthyState
}
// RegisterWatcher adds a function that will be called in its own goroutine
// whenever the health state of any client [Warnable] or the control-plane
// health messages change. The returned function can be used to unregister the
// callback.
//
// If a client [Warnable] becomes unhealthy or its unhealthy state is updated,
// the callback will be called with WarnableChanged set to true and the Warnable
// and its UnhealthyState:
//
//	go cb(Change{WarnableChanged: true, Warnable: w, UnhealthyState: us})
//
// If a Warnable becomes healthy, the callback will be called with
// WarnableChanged set to true, the Warnable set, and UnhealthyState set to nil:
//
//	go cb(Change{WarnableChanged: true, Warnable: w, UnhealthyState: nil})
//
// If the health messages from the control-plane change, the callback will be
// called with ControlHealthChanged set to true. Recipients can fetch the set of
// control-plane health messages by calling [Tracker.CurrentState]:
//
//	go cb(Change{ControlHealthChanged: true})
func (t *Tracker) RegisterWatcher(cb func(Change)) (unregister func()) {
// Wrap cb so the synchronous watcher machinery never blocks on it: each
// notification is handed off to a fresh goroutine.
return t.registerSyncWatcher(func(c Change) {
go cb(c)
})
}
// registerSyncWatcher adds a function that will be called whenever the health
// state of any Warnable changes. The provided callback function will be
// executed synchronously. Call RegisterWatcher to register any callbacks that
// won't return from execution immediately.
func (t *Tracker) registerSyncWatcher(cb func(w *Warnable, r *UnhealthyState)) (unregister func()) {
// state changes. The provided callback function will be executed synchronously.
// Call RegisterWatcher to register any callbacks that won't return from
// execution immediately.
func (t *Tracker) registerSyncWatcher(cb func(c Change)) (unregister func()) {
if t.nil() {
return func() {}
}
@@ -512,7 +570,7 @@ func (t *Tracker) registerSyncWatcher(cb func(w *Warnable, r *UnhealthyState)) (
t.mu.Lock()
defer t.mu.Unlock()
if t.watchers == nil {
t.watchers = set.HandleSet[func(*Warnable, *UnhealthyState)]{}
t.watchers = set.HandleSet[func(Change)]{}
}
handle := t.watchers.Add(cb)
if t.timer == nil {
@@ -659,13 +717,15 @@ func (t *Tracker) updateLegacyErrorWarnableLocked(key Subsystem, err error) {
}
}
func (t *Tracker) SetControlHealth(problems []string) {
func (t *Tracker) SetControlHealth(problems map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage) {
if t.nil() {
return
}
t.mu.Lock()
defer t.mu.Unlock()
t.controlHealth = problems
t.controlMessages = problems
t.selfCheckLocked()
}
@@ -961,11 +1021,11 @@ func (t *Tracker) OverallError() error {
return t.multiErrLocked()
}
// Strings() returns a string array containing the Text of all Warnings
// currently known to the Tracker. These strings can be presented to the
// user, although ideally you would use the Code property on each Warning
// to show a localized version of them instead.
// This function is here for legacy compatibility purposes and is deprecated.
// Strings() returns a string array containing the Text of all Warnings and
// ControlHealth messages currently known to the Tracker. These strings can be
// presented to the user, although ideally you would use the Code property on
// each Warning to show a localized version of them instead. This function is
// here for legacy compatibility purposes and is deprecated.
func (t *Tracker) Strings() []string {
if t.nil() {
return nil
@@ -991,6 +1051,19 @@ func (t *Tracker) stringsLocked() []string {
result = append(result, w.Text(ws.Args))
}
}
warnLen := len(result)
for _, c := range t.controlMessages {
if c.Title != "" && c.Text != "" {
result = append(result, c.Title+": "+c.Text)
} else if c.Title != "" {
result = append(result, c.Title)
} else if c.Text != "" {
result = append(result, c.Text)
}
}
sort.Strings(result[warnLen:])
return result
}
@@ -1171,14 +1244,10 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
t.setHealthyLocked(derpRegionErrorWarnable)
}
if len(t.controlHealth) > 0 {
for _, s := range t.controlHealth {
t.setUnhealthyLocked(controlHealthWarnable, Args{
ArgError: s,
})
}
} else {
t.setHealthyLocked(controlHealthWarnable)
// Check if control health messages have changed
if !maps.EqualFunc(t.lastNotifiedControlMessages, t.controlMessages, tailcfg.DisplayMessage.Equal) {
t.lastNotifiedControlMessages = t.controlMessages
t.notifyWatchersControlChangedLocked()
}
if err := envknob.ApplyDiskConfigError(); err != nil {

View File

@@ -5,12 +5,14 @@ package health
import (
"fmt"
"maps"
"reflect"
"slices"
"strconv"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/types/opt"
@@ -25,6 +27,7 @@ func TestAppendWarnableDebugFlags(t *testing.T) {
w := Register(&Warnable{
Code: WarnableCode(fmt.Sprintf("warnable-code-%d", i)),
MapDebugFlag: fmt.Sprint(i),
Text: StaticMessage(""),
})
defer unregister(w)
if i%2 == 0 {
@@ -114,7 +117,9 @@ func TestWatcher(t *testing.T) {
becameUnhealthy := make(chan struct{})
becameHealthy := make(chan struct{})
watcherFunc := func(w *Warnable, us *UnhealthyState) {
watcherFunc := func(c Change) {
w := c.Warnable
us := c.UnhealthyState
if w != testWarnable {
t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, testWarnable)
}
@@ -184,7 +189,9 @@ func TestSetUnhealthyWithTimeToVisible(t *testing.T) {
becameUnhealthy := make(chan struct{})
becameHealthy := make(chan struct{})
watchFunc := func(w *Warnable, us *UnhealthyState) {
watchFunc := func(c Change) {
w := c.Warnable
us := c.UnhealthyState
if w != mw {
t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, w)
}
@@ -457,21 +464,94 @@ func TestControlHealth(t *testing.T) {
ht.SetIPNState("NeedsLogin", true)
ht.GotStreamedMapResponse()
ht.SetControlHealth([]string{"Test message"})
state := ht.CurrentState()
warning, ok := state.Warnings["control-health"]
baseWarns := ht.CurrentState().Warnings
baseStrs := ht.Strings()
if !ok {
t.Fatal("no warning found in current state with code 'control-health'")
}
if got, want := warning.Title, "Coordination server reports an issue"; got != want {
t.Errorf("warning.Title = %q, want %q", got, want)
}
if got, want := warning.Severity, SeverityMedium; got != want {
t.Errorf("warning.Severity = %s, want %s", got, want)
}
if got, want := warning.Text, "The coordination server is reporting an health issue: Test message"; got != want {
t.Errorf("warning.Text = %q, want %q", got, want)
ht.SetControlHealth(map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{
"control-health-test": {
Title: "Control health message",
Text: "Extra help",
},
"control-health-title": {
Title: "Control health title only",
},
})
t.Run("Warnings", func(t *testing.T) {
wantWarns := map[WarnableCode]UnhealthyState{
"control-health-test": {
WarnableCode: "control-health-test",
Severity: SeverityMedium,
Title: "Control health message",
Text: "Extra help",
},
"control-health-title": {
WarnableCode: "control-health-title",
Severity: SeverityMedium,
Title: "Control health title only",
},
}
state := ht.CurrentState()
gotWarns := maps.Clone(state.Warnings)
for k := range gotWarns {
if _, inBase := baseWarns[k]; inBase {
delete(gotWarns, k)
}
}
if diff := cmp.Diff(wantWarns, gotWarns); diff != "" {
t.Fatalf(`CurrentState().Warnings["control-health-*"] wrong (-want +got):\n%s`, diff)
}
})
t.Run("Strings()", func(t *testing.T) {
wantStrs := []string{
"Control health message: Extra help",
"Control health title only",
}
var gotStrs []string
for _, s := range ht.Strings() {
if !slices.Contains(baseStrs, s) {
gotStrs = append(gotStrs, s)
}
}
if diff := cmp.Diff(wantStrs, gotStrs); diff != "" {
t.Fatalf(`Strings() wrong (-want +got):\n%s`, diff)
}
})
t.Run("tailscaled_health_messages", func(t *testing.T) {
var r usermetric.Registry
ht.SetMetricsRegistry(&r)
got := ht.metricHealthMessage.Get(metricHealthMessageLabel{
Type: MetricLabelWarning,
}).String()
want := strconv.Itoa(
2 + // from SetControlHealth
len(baseStrs),
)
if got != want {
t.Errorf("metricsHealthMessage.Get(warning) = %q, want %q", got, want)
}
})
}
// TestControlHealthNotifiesOnSet checks that a synchronous watcher registered
// before the first SetControlHealth call is invoked when control health
// messages are set for the first time.
func TestControlHealthNotifiesOnSet(t *testing.T) {
	ht := Tracker{}
	ht.SetIPNState("NeedsLogin", true)
	ht.GotStreamedMapResponse()

	// The watcher only records that it ran; the Change contents are not
	// inspected by this test.
	var notified bool
	ht.registerSyncWatcher(func(Change) { notified = true })

	msgs := map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{"test": {}}
	ht.SetControlHealth(msgs)

	if !notified {
		t.Errorf("watcher did not get called, want it to be called")
	}
}
@@ -480,12 +560,45 @@ func TestControlHealthNotifiesOnChange(t *testing.T) {
ht.SetIPNState("NeedsLogin", true)
ht.GotStreamedMapResponse()
ht.SetControlHealth(map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{
"test-1": {},
})
gotNotified := false
ht.registerSyncWatcher(func(_ *Warnable, _ *UnhealthyState) {
ht.registerSyncWatcher(func(_ Change) {
gotNotified = true
})
ht.SetControlHealth([]string{"Test message"})
ht.SetControlHealth(map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{
"test-2": {},
})
if !gotNotified {
t.Errorf("watcher did not get called, want it to be called")
}
}
func TestControlHealthNotifiesOnDetailsChange(t *testing.T) {
ht := Tracker{}
ht.SetIPNState("NeedsLogin", true)
ht.GotStreamedMapResponse()
ht.SetControlHealth(map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{
"test-1": {
Title: "Title",
},
})
gotNotified := false
ht.registerSyncWatcher(func(_ Change) {
gotNotified = true
})
ht.SetControlHealth(map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{
"test-1": {
Title: "Updated title",
},
})
if !gotNotified {
t.Errorf("watcher did not get called, want it to be called")
@@ -498,16 +611,20 @@ func TestControlHealthNoNotifyOnUnchanged(t *testing.T) {
ht.GotStreamedMapResponse()
// Set up an existing control health issue
ht.SetControlHealth([]string{"Test message"})
ht.SetControlHealth(map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{
"test": {},
})
// Now register our watcher
gotNotified := false
ht.registerSyncWatcher(func(_ *Warnable, _ *UnhealthyState) {
ht.registerSyncWatcher(func(_ Change) {
gotNotified = true
})
// Send the same control health message again - should not notify
ht.SetControlHealth([]string{"Test message"})
ht.SetControlHealth(map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{
"test": {},
})
if gotNotified {
t.Errorf("watcher got called, want it to not be called")
@@ -519,11 +636,13 @@ func TestControlHealthIgnoredOutsideMapPoll(t *testing.T) {
ht.SetIPNState("NeedsLogin", true)
gotNotified := false
ht.registerSyncWatcher(func(_ *Warnable, _ *UnhealthyState) {
ht.registerSyncWatcher(func(_ Change) {
gotNotified = true
})
ht.SetControlHealth([]string{"Test message"})
ht.SetControlHealth(map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage{
"control-health": {},
})
state := ht.CurrentState()
_, ok := state.Warnings["control-health"]

View File

@@ -5,6 +5,8 @@ package health
import (
"time"
"tailscale.com/tailcfg"
)
// State contains the health status of the backend, and is
@@ -21,7 +23,8 @@ type State struct {
}
// UnhealthyState contains information to be shown to the user to inform them
// that a Warnable is currently unhealthy.
// that a [Warnable] is currently unhealthy or [tailcfg.DisplayMessage] is being
// sent from the control-plane.
type UnhealthyState struct {
WarnableCode WarnableCode
Severity Severity
@@ -98,11 +101,34 @@ func (t *Tracker) CurrentState() *State {
wm[w.Code] = *w.unhealthyState(ws)
}
for id, msg := range t.lastNotifiedControlMessages {
code := WarnableCode(id)
wm[code] = UnhealthyState{
WarnableCode: code,
Severity: severityFromTailcfg(msg.Severity),
Title: msg.Title,
Text: msg.Text,
ImpactsConnectivity: msg.ImpactsConnectivity,
// TODO(tailscale/corp#27759): DependsOn?
}
}
return &State{
Warnings: wm,
}
}
// severityFromTailcfg translates the severity of a control-plane display
// message into this package's Severity. tailcfg.SeverityHigh and
// tailcfg.SeverityLow map to their direct counterparts; every other value
// maps to SeverityMedium.
func severityFromTailcfg(s tailcfg.DisplayMessageSeverity) Severity {
	if s == tailcfg.SeverityHigh {
		return SeverityHigh
	}
	if s == tailcfg.SeverityLow {
		return SeverityLow
	}
	// Fallback for every remaining value.
	return SeverityMedium
}
// isEffectivelyHealthyLocked reports whether w is effectively healthy.
// That means it's either actually healthy or it has a dependency that
// that's unhealthy, so we should treat w as healthy to not spam users

View File

@@ -238,16 +238,6 @@ var applyDiskConfigWarnable = Register(&Warnable{
},
})
// controlHealthWarnable is a Warnable that warns the user that the coordination server is reporting an health issue.
var controlHealthWarnable = Register(&Warnable{
Code: "control-health",
Title: "Coordination server reports an issue",
Severity: SeverityMedium,
Text: func(args Args) string {
return fmt.Sprintf("The coordination server is reporting an health issue: %v", args[ArgError])
},
})
// warmingUpWarnableDuration is the duration for which the warmingUpWarnable is reported by the backend after the user
// has changed ipnWantRunning to true from false.
const warmingUpWarnableDuration = 5 * time.Second