health: relax no-derp-home warnable to not fire if not in map poll

Fixes #14687

Change-Id: I05035df7e075e94dd39b2192bee34d878c15310d
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2025-01-27 14:21:25 +00:00 committed by Brad Fitzpatrick
parent bfde8079a0
commit bd9725c5f8
3 changed files with 88 additions and 23 deletions

View File

@ -22,6 +22,7 @@ import (
"tailscale.com/envknob" "tailscale.com/envknob"
"tailscale.com/metrics" "tailscale.com/metrics"
"tailscale.com/tailcfg" "tailscale.com/tailcfg"
"tailscale.com/tstime"
"tailscale.com/types/opt" "tailscale.com/types/opt"
"tailscale.com/util/cibuild" "tailscale.com/util/cibuild"
"tailscale.com/util/mak" "tailscale.com/util/mak"
@ -73,6 +74,8 @@ type Tracker struct {
// mu should not be held during init. // mu should not be held during init.
initOnce sync.Once initOnce sync.Once
testClock tstime.Clock // nil means use time.Now / tstime.StdClock{}
// mu guards everything that follows. // mu guards everything that follows.
mu sync.Mutex mu sync.Mutex
@ -80,13 +83,13 @@ type Tracker struct {
warnableVal map[*Warnable]*warningState warnableVal map[*Warnable]*warningState
// pendingVisibleTimers contains timers for Warnables that are unhealthy, but are // pendingVisibleTimers contains timers for Warnables that are unhealthy, but are
// not visible to the user yet, because they haven't been unhealthy for TimeToVisible // not visible to the user yet, because they haven't been unhealthy for TimeToVisible
pendingVisibleTimers map[*Warnable]*time.Timer pendingVisibleTimers map[*Warnable]tstime.TimerController
// sysErr maps subsystems to their current error (or nil if the subsystem is healthy) // sysErr maps subsystems to their current error (or nil if the subsystem is healthy)
// Deprecated: using Warnables should be preferred // Deprecated: using Warnables should be preferred
sysErr map[Subsystem]error sysErr map[Subsystem]error
watchers set.HandleSet[func(*Warnable, *UnhealthyState)] // opt func to run if error state changes watchers set.HandleSet[func(*Warnable, *UnhealthyState)] // opt func to run if error state changes
timer *time.Timer timer tstime.TimerController
latestVersion *tailcfg.ClientVersion // or nil latestVersion *tailcfg.ClientVersion // or nil
checkForUpdates bool checkForUpdates bool
@ -115,6 +118,20 @@ type Tracker struct {
metricHealthMessage *metrics.MultiLabelMap[metricHealthMessageLabel] metricHealthMessage *metrics.MultiLabelMap[metricHealthMessageLabel]
} }
func (t *Tracker) now() time.Time {
if t.testClock != nil {
return t.testClock.Now()
}
return time.Now()
}
func (t *Tracker) clock() tstime.Clock {
if t.testClock != nil {
return t.testClock
}
return tstime.StdClock{}
}
// Subsystem is the name of a subsystem whose health can be monitored. // Subsystem is the name of a subsystem whose health can be monitored.
// //
// Deprecated: Registering a Warnable using Register() and updating its health state // Deprecated: Registering a Warnable using Register() and updating its health state
@ -311,11 +328,11 @@ func (ws *warningState) Equal(other *warningState) bool {
// IsVisible returns whether the Warnable should be visible to the user, based on the TimeToVisible // IsVisible returns whether the Warnable should be visible to the user, based on the TimeToVisible
// field of the Warnable and the BrokenSince time when the Warnable became unhealthy. // field of the Warnable and the BrokenSince time when the Warnable became unhealthy.
func (w *Warnable) IsVisible(ws *warningState) bool { func (w *Warnable) IsVisible(ws *warningState, clockNow func() time.Time) bool {
if ws == nil || w.TimeToVisible == 0 { if ws == nil || w.TimeToVisible == 0 {
return true return true
} }
return time.Since(ws.BrokenSince) >= w.TimeToVisible return clockNow().Sub(ws.BrokenSince) >= w.TimeToVisible
} }
// SetMetricsRegistry sets up the metrics for the Tracker. It takes // SetMetricsRegistry sets up the metrics for the Tracker. It takes
@ -365,7 +382,7 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
// If we already have a warningState for this Warnable with an earlier BrokenSince time, keep that // If we already have a warningState for this Warnable with an earlier BrokenSince time, keep that
// BrokenSince time. // BrokenSince time.
brokenSince := time.Now() brokenSince := t.now()
if existingWS := t.warnableVal[w]; existingWS != nil { if existingWS := t.warnableVal[w]; existingWS != nil {
brokenSince = existingWS.BrokenSince brokenSince = existingWS.BrokenSince
} }
@ -384,15 +401,15 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
// If the Warnable has been unhealthy for more than its TimeToVisible, the callback should be // If the Warnable has been unhealthy for more than its TimeToVisible, the callback should be
// executed immediately. Otherwise, the callback should be enqueued to run once the Warnable // executed immediately. Otherwise, the callback should be enqueued to run once the Warnable
// becomes visible. // becomes visible.
if w.IsVisible(ws) { if w.IsVisible(ws, t.now) {
go cb(w, w.unhealthyState(ws)) go cb(w, w.unhealthyState(ws))
continue continue
} }
// The time remaining until the Warnable will be visible to the user is the TimeToVisible // The time remaining until the Warnable will be visible to the user is the TimeToVisible
// minus the time that has already passed since the Warnable became unhealthy. // minus the time that has already passed since the Warnable became unhealthy.
visibleIn := w.TimeToVisible - time.Since(brokenSince) visibleIn := w.TimeToVisible - t.now().Sub(brokenSince)
mak.Set(&t.pendingVisibleTimers, w, time.AfterFunc(visibleIn, func() { var tc tstime.TimerController = t.clock().AfterFunc(visibleIn, func() {
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
// Check if the Warnable is still unhealthy, as it could have become healthy between the time // Check if the Warnable is still unhealthy, as it could have become healthy between the time
@ -401,7 +418,8 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
go cb(w, w.unhealthyState(ws)) go cb(w, w.unhealthyState(ws))
delete(t.pendingVisibleTimers, w) delete(t.pendingVisibleTimers, w)
} }
})) })
mak.Set(&t.pendingVisibleTimers, w, tc)
} }
} }
} }
@ -476,7 +494,7 @@ func (t *Tracker) RegisterWatcher(cb func(w *Warnable, r *UnhealthyState)) (unre
} }
handle := t.watchers.Add(cb) handle := t.watchers.Add(cb)
if t.timer == nil { if t.timer == nil {
t.timer = time.AfterFunc(time.Minute, t.timerSelfCheck) t.timer = t.clock().AfterFunc(time.Minute, t.timerSelfCheck)
} }
return func() { return func() {
t.mu.Lock() t.mu.Lock()
@ -640,10 +658,10 @@ func (t *Tracker) GotStreamedMapResponse() {
} }
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
t.lastStreamedMapResponse = time.Now() t.lastStreamedMapResponse = t.now()
if !t.inMapPoll { if !t.inMapPoll {
t.inMapPoll = true t.inMapPoll = true
t.inMapPollSince = time.Now() t.inMapPollSince = t.now()
} }
t.selfCheckLocked() t.selfCheckLocked()
} }
@ -660,7 +678,7 @@ func (t *Tracker) SetOutOfPollNetMap() {
return return
} }
t.inMapPoll = false t.inMapPoll = false
t.lastMapPollEndedAt = time.Now() t.lastMapPollEndedAt = t.now()
t.selfCheckLocked() t.selfCheckLocked()
} }
@ -702,7 +720,7 @@ func (t *Tracker) NoteMapRequestHeard(mr *tailcfg.MapRequest) {
// against SetMagicSockDERPHome and // against SetMagicSockDERPHome and
// SetDERPRegionConnectedState // SetDERPRegionConnectedState
t.lastMapRequestHeard = time.Now() t.lastMapRequestHeard = t.now()
t.selfCheckLocked() t.selfCheckLocked()
} }
@ -740,7 +758,7 @@ func (t *Tracker) NoteDERPRegionReceivedFrame(region int) {
} }
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
mak.Set(&t.derpRegionLastFrame, region, time.Now()) mak.Set(&t.derpRegionLastFrame, region, t.now())
t.selfCheckLocked() t.selfCheckLocked()
} }
@ -799,9 +817,9 @@ func (t *Tracker) SetIPNState(state string, wantRunning bool) {
// The first time we see wantRunning=true and it used to be false, it means the user requested // The first time we see wantRunning=true and it used to be false, it means the user requested
// the backend to start. We store this timestamp and use it to silence some warnings that are // the backend to start. We store this timestamp and use it to silence some warnings that are
// expected during startup. // expected during startup.
t.ipnWantRunningLastTrue = time.Now() t.ipnWantRunningLastTrue = t.now()
t.setUnhealthyLocked(warmingUpWarnable, nil) t.setUnhealthyLocked(warmingUpWarnable, nil)
time.AfterFunc(warmingUpWarnableDuration, func() { t.clock().AfterFunc(warmingUpWarnableDuration, func() {
t.mu.Lock() t.mu.Lock()
t.updateWarmingUpWarnableLocked() t.updateWarmingUpWarnableLocked()
t.mu.Unlock() t.mu.Unlock()
@ -938,7 +956,7 @@ func (t *Tracker) Strings() []string {
func (t *Tracker) stringsLocked() []string { func (t *Tracker) stringsLocked() []string {
result := []string{} result := []string{}
for w, ws := range t.warnableVal { for w, ws := range t.warnableVal {
if !w.IsVisible(ws) { if !w.IsVisible(ws, t.now) {
// Do not append invisible warnings. // Do not append invisible warnings.
continue continue
} }
@ -1010,7 +1028,7 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
t.setHealthyLocked(localLogWarnable) t.setHealthyLocked(localLogWarnable)
} }
now := time.Now() now := t.now()
// How long we assume we'll have heard a DERP frame or a MapResponse // How long we assume we'll have heard a DERP frame or a MapResponse
// KeepAlive by. // KeepAlive by.
@ -1020,8 +1038,10 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
recentlyOn := now.Sub(t.ipnWantRunningLastTrue) < 5*time.Second recentlyOn := now.Sub(t.ipnWantRunningLastTrue) < 5*time.Second
homeDERP := t.derpHomeRegion homeDERP := t.derpHomeRegion
if recentlyOn { if recentlyOn || !t.inMapPoll {
// If user just turned Tailscale on, don't warn for a bit. // If user just turned Tailscale on, don't warn for a bit.
// Also, if we're not in a map poll, that means we don't yet
// have a DERPMap or aren't in a state where we even want
t.setHealthyLocked(noDERPHomeWarnable) t.setHealthyLocked(noDERPHomeWarnable)
t.setHealthyLocked(noDERPConnectionWarnable) t.setHealthyLocked(noDERPConnectionWarnable)
t.setHealthyLocked(derpTimeoutWarnable) t.setHealthyLocked(derpTimeoutWarnable)
@ -1170,7 +1190,7 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
// updateWarmingUpWarnableLocked ensures the warmingUpWarnable is healthy if wantRunning has been set to true // updateWarmingUpWarnableLocked ensures the warmingUpWarnable is healthy if wantRunning has been set to true
// for more than warmingUpWarnableDuration. // for more than warmingUpWarnableDuration.
func (t *Tracker) updateWarmingUpWarnableLocked() { func (t *Tracker) updateWarmingUpWarnableLocked() {
if !t.ipnWantRunningLastTrue.IsZero() && time.Now().After(t.ipnWantRunningLastTrue.Add(warmingUpWarnableDuration)) { if !t.ipnWantRunningLastTrue.IsZero() && t.now().After(t.ipnWantRunningLastTrue.Add(warmingUpWarnableDuration)) {
t.setHealthyLocked(warmingUpWarnable) t.setHealthyLocked(warmingUpWarnable)
} }
} }
@ -1282,7 +1302,7 @@ func (t *Tracker) LastNoiseDialWasRecent() bool {
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
now := time.Now() now := t.now()
dur := now.Sub(t.lastNoiseDial) dur := now.Sub(t.lastNoiseDial)
t.lastNoiseDial = now t.lastNoiseDial = now
return dur < 2*time.Minute return dur < 2*time.Minute

View File

@ -12,6 +12,7 @@ import (
"time" "time"
"tailscale.com/tailcfg" "tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/types/opt" "tailscale.com/types/opt"
"tailscale.com/util/usermetric" "tailscale.com/util/usermetric"
"tailscale.com/version" "tailscale.com/version"
@ -406,3 +407,47 @@ func TestHealthMetric(t *testing.T) {
}) })
} }
} }
// TestNoDERPHomeWarnable checks that we don't
// complain about no DERP home if we're not in a
// map poll.
func TestNoDERPHomeWarnable(t *testing.T) {
t.Skip("TODO: fix https://github.com/tailscale/tailscale/issues/14798 to make this test not deadlock")
clock := tstest.NewClock(tstest.ClockOpts{
Start: time.Unix(123, 0),
FollowRealTime: false,
})
ht := &Tracker{
testClock: clock,
}
ht.SetIPNState("NeedsLogin", true)
// Advance 30 seconds to get past the "recentlyLoggedIn" check.
clock.Advance(30 * time.Second)
ht.updateBuiltinWarnablesLocked()
// Advance to get past the the TimeToVisible delay.
clock.Advance(noDERPHomeWarnable.TimeToVisible * 2)
ht.updateBuiltinWarnablesLocked()
if ws, ok := ht.CurrentState().Warnings[noDERPHomeWarnable.Code]; ok {
t.Fatalf("got unexpected noDERPHomeWarnable warnable: %v", ws)
}
}
// TestNoDERPHomeWarnableManual is like TestNoDERPHomeWarnable
// but doesn't use tstest.Clock so avoids the deadlock
// I hit: https://github.com/tailscale/tailscale/issues/14798
func TestNoDERPHomeWarnableManual(t *testing.T) {
ht := &Tracker{}
ht.SetIPNState("NeedsLogin", true)
// Avoid wantRunning:
ht.ipnWantRunningLastTrue = ht.ipnWantRunningLastTrue.Add(-10 * time.Second)
ht.updateBuiltinWarnablesLocked()
ws, ok := ht.warnableVal[noDERPHomeWarnable]
if ok {
t.Fatalf("got unexpected noDERPHomeWarnable warnable: %v", ws)
}
}

View File

@ -86,7 +86,7 @@ func (t *Tracker) CurrentState() *State {
wm := map[WarnableCode]UnhealthyState{} wm := map[WarnableCode]UnhealthyState{}
for w, ws := range t.warnableVal { for w, ws := range t.warnableVal {
if !w.IsVisible(ws) { if !w.IsVisible(ws, t.now) {
// Skip invisible Warnables. // Skip invisible Warnables.
continue continue
} }