health: relax no-derp-home warnable to not fire if not in map poll

Fixes #14687

Change-Id: I05035df7e075e94dd39b2192bee34d878c15310d
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2025-01-27 14:21:25 +00:00 committed by Brad Fitzpatrick
parent bfde8079a0
commit bd9725c5f8
3 changed files with 88 additions and 23 deletions

View File

@ -22,6 +22,7 @@ import (
"tailscale.com/envknob"
"tailscale.com/metrics"
"tailscale.com/tailcfg"
"tailscale.com/tstime"
"tailscale.com/types/opt"
"tailscale.com/util/cibuild"
"tailscale.com/util/mak"
@ -73,6 +74,8 @@ type Tracker struct {
// mu should not be held during init.
initOnce sync.Once
testClock tstime.Clock // nil means use time.Now / tstime.StdClock{}
// mu guards everything that follows.
mu sync.Mutex
@ -80,13 +83,13 @@ type Tracker struct {
warnableVal map[*Warnable]*warningState
// pendingVisibleTimers contains timers for Warnables that are unhealthy, but are
// not visible to the user yet, because they haven't been unhealthy for TimeToVisible
pendingVisibleTimers map[*Warnable]*time.Timer
pendingVisibleTimers map[*Warnable]tstime.TimerController
// sysErr maps subsystems to their current error (or nil if the subsystem is healthy)
// Deprecated: using Warnables should be preferred
sysErr map[Subsystem]error
watchers set.HandleSet[func(*Warnable, *UnhealthyState)] // opt func to run if error state changes
timer *time.Timer
timer tstime.TimerController
latestVersion *tailcfg.ClientVersion // or nil
checkForUpdates bool
@ -115,6 +118,20 @@ type Tracker struct {
metricHealthMessage *metrics.MultiLabelMap[metricHealthMessageLabel]
}
func (t *Tracker) now() time.Time {
if t.testClock != nil {
return t.testClock.Now()
}
return time.Now()
}
func (t *Tracker) clock() tstime.Clock {
if t.testClock != nil {
return t.testClock
}
return tstime.StdClock{}
}
// Subsystem is the name of a subsystem whose health can be monitored.
//
// Deprecated: Registering a Warnable using Register() and updating its health state
@ -311,11 +328,11 @@ func (ws *warningState) Equal(other *warningState) bool {
// IsVisible returns whether the Warnable should be visible to the user, based on the TimeToVisible
// field of the Warnable and the BrokenSince time when the Warnable became unhealthy.
func (w *Warnable) IsVisible(ws *warningState) bool {
func (w *Warnable) IsVisible(ws *warningState, clockNow func() time.Time) bool {
if ws == nil || w.TimeToVisible == 0 {
return true
}
return time.Since(ws.BrokenSince) >= w.TimeToVisible
return clockNow().Sub(ws.BrokenSince) >= w.TimeToVisible
}
// SetMetricsRegistry sets up the metrics for the Tracker. It takes
@ -365,7 +382,7 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
// If we already have a warningState for this Warnable with an earlier BrokenSince time, keep that
// BrokenSince time.
brokenSince := time.Now()
brokenSince := t.now()
if existingWS := t.warnableVal[w]; existingWS != nil {
brokenSince = existingWS.BrokenSince
}
@ -384,15 +401,15 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
// If the Warnable has been unhealthy for more than its TimeToVisible, the callback should be
// executed immediately. Otherwise, the callback should be enqueued to run once the Warnable
// becomes visible.
if w.IsVisible(ws) {
if w.IsVisible(ws, t.now) {
go cb(w, w.unhealthyState(ws))
continue
}
// The time remaining until the Warnable will be visible to the user is the TimeToVisible
// minus the time that has already passed since the Warnable became unhealthy.
visibleIn := w.TimeToVisible - time.Since(brokenSince)
mak.Set(&t.pendingVisibleTimers, w, time.AfterFunc(visibleIn, func() {
visibleIn := w.TimeToVisible - t.now().Sub(brokenSince)
var tc tstime.TimerController = t.clock().AfterFunc(visibleIn, func() {
t.mu.Lock()
defer t.mu.Unlock()
// Check if the Warnable is still unhealthy, as it could have become healthy between the time
@ -401,7 +418,8 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
go cb(w, w.unhealthyState(ws))
delete(t.pendingVisibleTimers, w)
}
}))
})
mak.Set(&t.pendingVisibleTimers, w, tc)
}
}
}
@ -476,7 +494,7 @@ func (t *Tracker) RegisterWatcher(cb func(w *Warnable, r *UnhealthyState)) (unre
}
handle := t.watchers.Add(cb)
if t.timer == nil {
t.timer = time.AfterFunc(time.Minute, t.timerSelfCheck)
t.timer = t.clock().AfterFunc(time.Minute, t.timerSelfCheck)
}
return func() {
t.mu.Lock()
@ -640,10 +658,10 @@ func (t *Tracker) GotStreamedMapResponse() {
}
t.mu.Lock()
defer t.mu.Unlock()
t.lastStreamedMapResponse = time.Now()
t.lastStreamedMapResponse = t.now()
if !t.inMapPoll {
t.inMapPoll = true
t.inMapPollSince = time.Now()
t.inMapPollSince = t.now()
}
t.selfCheckLocked()
}
@ -660,7 +678,7 @@ func (t *Tracker) SetOutOfPollNetMap() {
return
}
t.inMapPoll = false
t.lastMapPollEndedAt = time.Now()
t.lastMapPollEndedAt = t.now()
t.selfCheckLocked()
}
@ -702,7 +720,7 @@ func (t *Tracker) NoteMapRequestHeard(mr *tailcfg.MapRequest) {
// against SetMagicSockDERPHome and
// SetDERPRegionConnectedState
t.lastMapRequestHeard = time.Now()
t.lastMapRequestHeard = t.now()
t.selfCheckLocked()
}
@ -740,7 +758,7 @@ func (t *Tracker) NoteDERPRegionReceivedFrame(region int) {
}
t.mu.Lock()
defer t.mu.Unlock()
mak.Set(&t.derpRegionLastFrame, region, time.Now())
mak.Set(&t.derpRegionLastFrame, region, t.now())
t.selfCheckLocked()
}
@ -799,9 +817,9 @@ func (t *Tracker) SetIPNState(state string, wantRunning bool) {
// The first time we see wantRunning=true and it used to be false, it means the user requested
// the backend to start. We store this timestamp and use it to silence some warnings that are
// expected during startup.
t.ipnWantRunningLastTrue = time.Now()
t.ipnWantRunningLastTrue = t.now()
t.setUnhealthyLocked(warmingUpWarnable, nil)
time.AfterFunc(warmingUpWarnableDuration, func() {
t.clock().AfterFunc(warmingUpWarnableDuration, func() {
t.mu.Lock()
t.updateWarmingUpWarnableLocked()
t.mu.Unlock()
@ -938,7 +956,7 @@ func (t *Tracker) Strings() []string {
func (t *Tracker) stringsLocked() []string {
result := []string{}
for w, ws := range t.warnableVal {
if !w.IsVisible(ws) {
if !w.IsVisible(ws, t.now) {
// Do not append invisible warnings.
continue
}
@ -1010,7 +1028,7 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
t.setHealthyLocked(localLogWarnable)
}
now := time.Now()
now := t.now()
// How long we assume we'll have heard a DERP frame or a MapResponse
// KeepAlive by.
@ -1020,8 +1038,10 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
recentlyOn := now.Sub(t.ipnWantRunningLastTrue) < 5*time.Second
homeDERP := t.derpHomeRegion
if recentlyOn {
if recentlyOn || !t.inMapPoll {
// If user just turned Tailscale on, don't warn for a bit.
// Also, if we're not in a map poll, that means we don't yet
// have a DERPMap or aren't in a state where we even want
t.setHealthyLocked(noDERPHomeWarnable)
t.setHealthyLocked(noDERPConnectionWarnable)
t.setHealthyLocked(derpTimeoutWarnable)
@ -1170,7 +1190,7 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
// updateWarmingUpWarnableLocked ensures the warmingUpWarnable is healthy if wantRunning has been set to true
// for more than warmingUpWarnableDuration.
func (t *Tracker) updateWarmingUpWarnableLocked() {
if !t.ipnWantRunningLastTrue.IsZero() && time.Now().After(t.ipnWantRunningLastTrue.Add(warmingUpWarnableDuration)) {
if !t.ipnWantRunningLastTrue.IsZero() && t.now().After(t.ipnWantRunningLastTrue.Add(warmingUpWarnableDuration)) {
t.setHealthyLocked(warmingUpWarnable)
}
}
@ -1282,7 +1302,7 @@ func (t *Tracker) LastNoiseDialWasRecent() bool {
t.mu.Lock()
defer t.mu.Unlock()
now := time.Now()
now := t.now()
dur := now.Sub(t.lastNoiseDial)
t.lastNoiseDial = now
return dur < 2*time.Minute

View File

@ -12,6 +12,7 @@ import (
"time"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/types/opt"
"tailscale.com/util/usermetric"
"tailscale.com/version"
@ -406,3 +407,47 @@ func TestHealthMetric(t *testing.T) {
})
}
}
// TestNoDERPHomeWarnable checks that we don't
// complain about no DERP home if we're not in a
// map poll.
func TestNoDERPHomeWarnable(t *testing.T) {
t.Skip("TODO: fix https://github.com/tailscale/tailscale/issues/14798 to make this test not deadlock")
clock := tstest.NewClock(tstest.ClockOpts{
Start: time.Unix(123, 0),
FollowRealTime: false,
})
ht := &Tracker{
testClock: clock,
}
ht.SetIPNState("NeedsLogin", true)
// Advance 30 seconds to get past the "recentlyLoggedIn" check.
clock.Advance(30 * time.Second)
ht.updateBuiltinWarnablesLocked()
// Advance to get past the the TimeToVisible delay.
clock.Advance(noDERPHomeWarnable.TimeToVisible * 2)
ht.updateBuiltinWarnablesLocked()
if ws, ok := ht.CurrentState().Warnings[noDERPHomeWarnable.Code]; ok {
t.Fatalf("got unexpected noDERPHomeWarnable warnable: %v", ws)
}
}
// TestNoDERPHomeWarnableManual is like TestNoDERPHomeWarnable
// but doesn't use tstest.Clock so avoids the deadlock
// I hit: https://github.com/tailscale/tailscale/issues/14798
func TestNoDERPHomeWarnableManual(t *testing.T) {
ht := &Tracker{}
ht.SetIPNState("NeedsLogin", true)
// Avoid wantRunning:
ht.ipnWantRunningLastTrue = ht.ipnWantRunningLastTrue.Add(-10 * time.Second)
ht.updateBuiltinWarnablesLocked()
ws, ok := ht.warnableVal[noDERPHomeWarnable]
if ok {
t.Fatalf("got unexpected noDERPHomeWarnable warnable: %v", ws)
}
}

View File

@ -86,7 +86,7 @@ func (t *Tracker) CurrentState() *State {
wm := map[WarnableCode]UnhealthyState{}
for w, ws := range t.warnableVal {
if !w.IsVisible(ws) {
if !w.IsVisible(ws, t.now) {
// Skip invisible Warnables.
continue
}