diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index edd0ae29c..a5397594e 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -21,6 +21,7 @@ import ( "tailscale.com/types/netmap" "tailscale.com/types/persist" "tailscale.com/types/structs" + "tailscale.com/util/clientmetric" "tailscale.com/util/execqueue" ) @@ -131,6 +132,8 @@ type Auto struct { // the server. lastUpdateGen updateGen + lastStatus atomic.Pointer[Status] + paused bool // whether we should stop making HTTP requests unpauseWaiters []chan bool // chans that gets sent true (once) on wake, or false on Shutdown loggedIn bool // true if currently logged in @@ -596,21 +599,85 @@ func (c *Auto) sendStatus(who string, err error, url string, nm *netmap.NetworkM // not logged in. nm = nil } - new := Status{ + newSt := &Status{ URL: url, Persist: p, NetMap: nm, Err: err, state: state, } + c.lastStatus.Store(newSt) // Launch a new goroutine to avoid blocking the caller while the observer // does its thing, which may result in a call back into the client. + metricQueued.Add(1) c.observerQueue.Add(func() { - c.observer.SetControlClientStatus(c, new) + if canSkipStatus(newSt, c.lastStatus.Load()) { + metricSkippable.Add(1) + if !c.direct.controlKnobs.DisableSkipStatusQueue.Load() { + metricSkipped.Add(1) + return + } + } + c.observer.SetControlClientStatus(c, *newSt) + // Best effort stop retaining the memory now that + // we've sent it to the observer (LocalBackend). + // We CAS here because the caller goroutine is + // doing a Store which we want to want to win + // a race. This is only a memory optimization + // and is for correctness: + c.lastStatus.CompareAndSwap(newSt, nil) }) } +var ( + metricQueued = clientmetric.NewCounter("controlclient_auto_status_queued") + metricSkippable = clientmetric.NewCounter("controlclient_auto_status_queue_skippable") + metricSkipped = clientmetric.NewCounter("controlclient_auto_status_queue_skipped") +) + +// canSkipStatus reports whether we can skip sending s1, knowing +// that s2 is enqueued sometime in the future after s1. +// +// s1 must be non-nil. s2 may be nil. +func canSkipStatus(s1, s2 *Status) bool { + if s2 == nil { + // Nothing in the future. + return false + } + if s1 == s2 { + // If the last item in the queue is the same as s1, + // we can't skip it. + return false + } + if s1.Err != nil || s1.URL != "" { + // If s1 has an error or a URL, we shouldn't skip it, lest the error go + // away in s2 or in-between. We want to make sure all the subsystems see + // it. Plus there aren't many of these, so not worth skipping. + return false + } + if !s1.Persist.Equals(s2.Persist) || s1.state != s2.state { + // If s1 has a different Persist or state than s2, + // don't skip it. We only care about skipping the typical + // entries where the only difference is the NetMap. + return false + } + // If nothing above precludes it, and both s1 and s2 have NetMaps, then + // we can skip it, because s2's NetMap is a newer version and we can + // jump straight from whatever state we had before to s2's state, + // without passing through s1's state first. A NetMap is regrettably a + // full snapshot of the state, not an incremental delta. We're slowly + // moving towards passing around only deltas around internally at all + // layers, but this is explicitly the case where we didn't have a delta + // path for the message we received over the wire and had to resort + // to the legacy full NetMap path. And then we can get behind processing + // these full NetMap snapshots in LocalBackend/wgengine/magicsock/netstack + // and this path (when it returns true) lets us skip over useless work + // and not get behind in the queue. This matters in particular for tailnets + // that are both very large + very churny. + return s1.NetMap != nil && s2.NetMap != nil +} + func (c *Auto) Login(flags LoginFlags) { c.logf("client.Login(%v)", flags) diff --git a/control/controlclient/controlclient_test.go b/control/controlclient/controlclient_test.go index b37623451..6885b5851 100644 --- a/control/controlclient/controlclient_test.go +++ b/control/controlclient/controlclient_test.go @@ -4,8 +4,13 @@ package controlclient import ( + "io" "reflect" + "slices" "testing" + + "tailscale.com/types/netmap" + "tailscale.com/types/persist" ) func fieldsOf(t reflect.Type) (fields []string) { @@ -62,3 +67,83 @@ func TestStatusEqual(t *testing.T) { } } } + +// tests [canSkipStatus]. +func TestCanSkipStatus(t *testing.T) { + st := new(Status) + nm1 := &netmap.NetworkMap{} + nm2 := &netmap.NetworkMap{} + + tests := []struct { + name string + s1, s2 *Status + want bool + }{ + { + name: "nil-s2", + s1: st, + s2: nil, + want: false, + }, + { + name: "equal", + s1: st, + s2: st, + want: false, + }, + { + name: "s1-error", + s1: &Status{Err: io.EOF, NetMap: nm1}, + s2: &Status{NetMap: nm2}, + want: false, + }, + { + name: "s1-url", + s1: &Status{URL: "foo", NetMap: nm1}, + s2: &Status{NetMap: nm2}, + want: false, + }, + { + name: "s1-persist-diff", + s1: &Status{Persist: new(persist.Persist).View(), NetMap: nm1}, + s2: &Status{NetMap: nm2}, + want: false, + }, + { + name: "s1-state-diff", + s1: &Status{state: 123, NetMap: nm1}, + s2: &Status{NetMap: nm2}, + want: false, + }, + { + name: "s1-no-netmap1", + s1: &Status{NetMap: nil}, + s2: &Status{NetMap: nm2}, + want: false, + }, + { + name: "s1-no-netmap2", + s1: &Status{NetMap: nm1}, + s2: &Status{NetMap: nil}, + want: false, + }, + { + name: "skip", + s1: &Status{NetMap: nm1}, + s2: &Status{NetMap: nm2}, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := canSkipStatus(tt.s1, tt.s2); got != tt.want { + t.Errorf("canSkipStatus = %v, want %v", got, tt.want) + } + }) + } + + want := []string{"Err", "URL", "NetMap", "Persist", "state"} + if f := fieldsOf(reflect.TypeFor[Status]()); !slices.Equal(f, want) { + t.Errorf("Status fields = %q; this code was only written to handle fields %q", f, want) + } +} diff --git a/control/controlknobs/controlknobs.go b/control/controlknobs/controlknobs.go index dd76a3abd..c7933be5a 100644 --- a/control/controlknobs/controlknobs.go +++ b/control/controlknobs/controlknobs.go @@ -103,6 +103,11 @@ type Knobs struct { // DisableCaptivePortalDetection is whether the node should not perform captive portal detection // automatically when the network state changes. DisableCaptivePortalDetection atomic.Bool + + // DisableSkipStatusQueue is whether the node should disable skipping + // of queued netmap.NetworkMap between the controlclient and LocalBackend. + // See tailscale/tailscale#14768. + DisableSkipStatusQueue atomic.Bool } // UpdateFromNodeAttributes updates k (if non-nil) based on the provided self @@ -132,6 +137,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { disableLocalDNSOverrideViaNRPT = has(tailcfg.NodeAttrDisableLocalDNSOverrideViaNRPT) disableCryptorouting = has(tailcfg.NodeAttrDisableMagicSockCryptoRouting) disableCaptivePortalDetection = has(tailcfg.NodeAttrDisableCaptivePortalDetection) + disableSkipStatusQueue = has(tailcfg.NodeAttrDisableSkipStatusQueue) ) if has(tailcfg.NodeAttrOneCGNATEnable) { @@ -159,6 +165,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { k.DisableLocalDNSOverrideViaNRPT.Store(disableLocalDNSOverrideViaNRPT) k.DisableCryptorouting.Store(disableCryptorouting) k.DisableCaptivePortalDetection.Store(disableCaptivePortalDetection) + k.DisableSkipStatusQueue.Store(disableSkipStatusQueue) } // AsDebugJSON returns k as something that can be marshalled with json.Marshal @@ -187,5 +194,6 @@ func (k *Knobs) AsDebugJSON() map[string]any { "DisableLocalDNSOverrideViaNRPT": k.DisableLocalDNSOverrideViaNRPT.Load(), "DisableCryptorouting": k.DisableCryptorouting.Load(), "DisableCaptivePortalDetection": k.DisableCaptivePortalDetection.Load(), + "DisableSkipStatusQueue": k.DisableSkipStatusQueue.Load(), } } diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 738c8a5dc..c17cd5f45 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -2470,6 +2470,11 @@ const ( // automatically when the network state changes. NodeAttrDisableCaptivePortalDetection NodeCapability = "disable-captive-portal-detection" + // NodeAttrDisableSkipStatusQueue is set when the node should disable skipping + // of queued netmap.NetworkMap between the controlclient and LocalBackend. + // See tailscale/tailscale#14768. + NodeAttrDisableSkipStatusQueue NodeCapability = "disable-skip-status-queue" + // NodeAttrSSHEnvironmentVariables enables logic for handling environment variables sent // via SendEnv in the SSH server and applying them to the SSH session. NodeAttrSSHEnvironmentVariables NodeCapability = "ssh-env-vars"