control/controlclient: skip SetControlClientStatus when queue has newer results later

Updates #1909
Updates #12542
Updates tailscale/corp#26058

Change-Id: I3033d235ca49f9739fdf3deaf603eea4ec3e407e
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2025-01-24 13:09:21 -08:00 committed by Brad Fitzpatrick
parent cbf1a9abe1
commit 1a7274fccb
4 changed files with 167 additions and 2 deletions

View File

@ -21,6 +21,7 @@ import (
"tailscale.com/types/netmap"
"tailscale.com/types/persist"
"tailscale.com/types/structs"
"tailscale.com/util/clientmetric"
"tailscale.com/util/execqueue"
)
@ -131,6 +132,8 @@ type Auto struct {
// the server.
lastUpdateGen updateGen
lastStatus atomic.Pointer[Status]
paused bool // whether we should stop making HTTP requests
unpauseWaiters []chan bool // chans that gets sent true (once) on wake, or false on Shutdown
loggedIn bool // true if currently logged in
@ -596,21 +599,85 @@ func (c *Auto) sendStatus(who string, err error, url string, nm *netmap.NetworkM
// not logged in.
nm = nil
}
new := Status{
newSt := &Status{
URL: url,
Persist: p,
NetMap: nm,
Err: err,
state: state,
}
c.lastStatus.Store(newSt)
// Launch a new goroutine to avoid blocking the caller while the observer
// does its thing, which may result in a call back into the client.
metricQueued.Add(1)
c.observerQueue.Add(func() {
c.observer.SetControlClientStatus(c, new)
if canSkipStatus(newSt, c.lastStatus.Load()) {
metricSkippable.Add(1)
if !c.direct.controlKnobs.DisableSkipStatusQueue.Load() {
metricSkipped.Add(1)
return
}
}
c.observer.SetControlClientStatus(c, *newSt)
// Best effort stop retaining the memory now that
// we've sent it to the observer (LocalBackend).
// We CAS here because the caller goroutine is
// doing a Store which we want to want to win
// a race. This is only a memory optimization
// and is for correctness:
c.lastStatus.CompareAndSwap(newSt, nil)
})
}
var (
metricQueued = clientmetric.NewCounter("controlclient_auto_status_queued")
metricSkippable = clientmetric.NewCounter("controlclient_auto_status_queue_skippable")
metricSkipped = clientmetric.NewCounter("controlclient_auto_status_queue_skipped")
)
// canSkipStatus reports whether we can skip sending s1, knowing
// that s2 is enqueued sometime in the future after s1.
//
// s1 must be non-nil. s2 may be nil.
func canSkipStatus(s1, s2 *Status) bool {
if s2 == nil {
// Nothing in the future.
return false
}
if s1 == s2 {
// If the last item in the queue is the same as s1,
// we can't skip it.
return false
}
if s1.Err != nil || s1.URL != "" {
// If s1 has an error or a URL, we shouldn't skip it, lest the error go
// away in s2 or in-between. We want to make sure all the subsystems see
// it. Plus there aren't many of these, so not worth skipping.
return false
}
if !s1.Persist.Equals(s2.Persist) || s1.state != s2.state {
// If s1 has a different Persist or state than s2,
// don't skip it. We only care about skipping the typical
// entries where the only difference is the NetMap.
return false
}
// If nothing above precludes it, and both s1 and s2 have NetMaps, then
// we can skip it, because s2's NetMap is a newer version and we can
// jump straight from whatever state we had before to s2's state,
// without passing through s1's state first. A NetMap is regrettably a
// full snapshot of the state, not an incremental delta. We're slowly
// moving towards passing around only deltas around internally at all
// layers, but this is explicitly the case where we didn't have a delta
// path for the message we received over the wire and had to resort
// to the legacy full NetMap path. And then we can get behind processing
// these full NetMap snapshots in LocalBackend/wgengine/magicsock/netstack
// and this path (when it returns true) lets us skip over useless work
// and not get behind in the queue. This matters in particular for tailnets
// that are both very large + very churny.
return s1.NetMap != nil && s2.NetMap != nil
}
func (c *Auto) Login(flags LoginFlags) {
c.logf("client.Login(%v)", flags)

View File

@ -4,8 +4,13 @@
package controlclient
import (
"io"
"reflect"
"slices"
"testing"
"tailscale.com/types/netmap"
"tailscale.com/types/persist"
)
func fieldsOf(t reflect.Type) (fields []string) {
@ -62,3 +67,83 @@ func TestStatusEqual(t *testing.T) {
}
}
}
// tests [canSkipStatus].
func TestCanSkipStatus(t *testing.T) {
st := new(Status)
nm1 := &netmap.NetworkMap{}
nm2 := &netmap.NetworkMap{}
tests := []struct {
name string
s1, s2 *Status
want bool
}{
{
name: "nil-s2",
s1: st,
s2: nil,
want: false,
},
{
name: "equal",
s1: st,
s2: st,
want: false,
},
{
name: "s1-error",
s1: &Status{Err: io.EOF, NetMap: nm1},
s2: &Status{NetMap: nm2},
want: false,
},
{
name: "s1-url",
s1: &Status{URL: "foo", NetMap: nm1},
s2: &Status{NetMap: nm2},
want: false,
},
{
name: "s1-persist-diff",
s1: &Status{Persist: new(persist.Persist).View(), NetMap: nm1},
s2: &Status{NetMap: nm2},
want: false,
},
{
name: "s1-state-diff",
s1: &Status{state: 123, NetMap: nm1},
s2: &Status{NetMap: nm2},
want: false,
},
{
name: "s1-no-netmap1",
s1: &Status{NetMap: nil},
s2: &Status{NetMap: nm2},
want: false,
},
{
name: "s1-no-netmap2",
s1: &Status{NetMap: nm1},
s2: &Status{NetMap: nil},
want: false,
},
{
name: "skip",
s1: &Status{NetMap: nm1},
s2: &Status{NetMap: nm2},
want: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := canSkipStatus(tt.s1, tt.s2); got != tt.want {
t.Errorf("canSkipStatus = %v, want %v", got, tt.want)
}
})
}
want := []string{"Err", "URL", "NetMap", "Persist", "state"}
if f := fieldsOf(reflect.TypeFor[Status]()); !slices.Equal(f, want) {
t.Errorf("Status fields = %q; this code was only written to handle fields %q", f, want)
}
}

View File

@ -103,6 +103,11 @@ type Knobs struct {
// DisableCaptivePortalDetection is whether the node should not perform captive portal detection
// automatically when the network state changes.
DisableCaptivePortalDetection atomic.Bool
// DisableSkipStatusQueue is whether the node should disable skipping
// of queued netmap.NetworkMap between the controlclient and LocalBackend.
// See tailscale/tailscale#14768.
DisableSkipStatusQueue atomic.Bool
}
// UpdateFromNodeAttributes updates k (if non-nil) based on the provided self
@ -132,6 +137,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) {
disableLocalDNSOverrideViaNRPT = has(tailcfg.NodeAttrDisableLocalDNSOverrideViaNRPT)
disableCryptorouting = has(tailcfg.NodeAttrDisableMagicSockCryptoRouting)
disableCaptivePortalDetection = has(tailcfg.NodeAttrDisableCaptivePortalDetection)
disableSkipStatusQueue = has(tailcfg.NodeAttrDisableSkipStatusQueue)
)
if has(tailcfg.NodeAttrOneCGNATEnable) {
@ -159,6 +165,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) {
k.DisableLocalDNSOverrideViaNRPT.Store(disableLocalDNSOverrideViaNRPT)
k.DisableCryptorouting.Store(disableCryptorouting)
k.DisableCaptivePortalDetection.Store(disableCaptivePortalDetection)
k.DisableSkipStatusQueue.Store(disableSkipStatusQueue)
}
// AsDebugJSON returns k as something that can be marshalled with json.Marshal
@ -187,5 +194,6 @@ func (k *Knobs) AsDebugJSON() map[string]any {
"DisableLocalDNSOverrideViaNRPT": k.DisableLocalDNSOverrideViaNRPT.Load(),
"DisableCryptorouting": k.DisableCryptorouting.Load(),
"DisableCaptivePortalDetection": k.DisableCaptivePortalDetection.Load(),
"DisableSkipStatusQueue": k.DisableSkipStatusQueue.Load(),
}
}

View File

@ -2470,6 +2470,11 @@ const (
// automatically when the network state changes.
NodeAttrDisableCaptivePortalDetection NodeCapability = "disable-captive-portal-detection"
// NodeAttrDisableSkipStatusQueue is set when the node should disable skipping
// of queued netmap.NetworkMap between the controlclient and LocalBackend.
// See tailscale/tailscale#14768.
NodeAttrDisableSkipStatusQueue NodeCapability = "disable-skip-status-queue"
// NodeAttrSSHEnvironmentVariables enables logic for handling environment variables sent
// via SendEnv in the SSH server and applying them to the SSH session.
NodeAttrSSHEnvironmentVariables NodeCapability = "ssh-env-vars"