ipn/ipnlocal: signal nodeBackend readiness and shutdown

We update LocalBackend to shut down the current nodeBackend
when switching to a different node, and to mark the new node's
nodeBackend as ready when the switch completes.

Updates tailscale/corp#28014
Updates tailscale/corp#29543
Updates #12614

Signed-off-by: Nick Khyl <nickk@tailscale.com>
Commit 733bfaeffe (parent fe391d5694)
Authored and committed by Nick Khyl, 2025-06-13 12:51:40 -05:00
3 changed files with 230 additions and 16 deletions
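
For context, the lifecycle added here is driven from LocalBackend when it switches nodes. Below is a minimal sketch of that sequence, assuming package-internal access to [nodeBackend]; the helper, its cause error, and the exact ordering are hypothetical, and the real call sites are in the other files of this commit:

// switchNodeBackends is a hypothetical illustration of the sequence this
// commit enables; LocalBackend's actual plumbing may differ.
func switchNodeBackends(ctx context.Context, old *nodeBackend) (*nodeBackend, error) {
	// Shut down the outgoing node's backend: this cancels old.Context()
	// and makes pending and future old.Wait calls return the cause.
	old.shutdown(errors.New("switching to a different node")) // hypothetical cause

	nb := newNodeBackend(ctx)
	// ... LocalBackend reconfigures itself for the new node here ...
	nb.ready() // the switch is complete; unblock nb.Wait callers
	return nb, nb.Wait(ctx)
}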

@@ -5,6 +5,7 @@ package ipnlocal
 import (
 	"cmp"
+	"context"
 	"net/netip"
 	"slices"
 	"sync"
@@ -39,7 +40,7 @@ import (
 // Two pointers to different [nodeBackend] instances represent different local nodes.
 // However, there's currently a bug where a new [nodeBackend] might not be created
 // during an implicit node switch (see tailscale/corp#28014).
 //
 // In the future, we might want to include at least the following in this struct (in addition to the current fields).
 // However, not everything should be exported or otherwise made available to the outside world (e.g. [ipnext] extensions,
 // peer API handlers, etc.).
@@ -61,6 +62,9 @@ import (
 // Even if they're tied to the local node, instead of moving them here, we should extract the entire feature
 // into a separate package and have it install proper hooks.
 type nodeBackend struct {
+	ctx       context.Context         // canceled by [nodeBackend.shutdown]
+	ctxCancel context.CancelCauseFunc // cancels ctx
+
 	// filterAtomic is a stateful packet filter. Immutable once created, but can be
 	// replaced with a new one.
 	filterAtomic atomic.Pointer[filter.Filter]
@@ -68,6 +72,9 @@ type nodeBackend struct {
 	// TODO(nickkhyl): maybe use sync.RWMutex?
 	mu sync.Mutex // protects the following fields

+	shutdownOnce sync.Once     // guards calling [nodeBackend.shutdown]
+	readyCh      chan struct{} // closed by [nodeBackend.ready]; nil after shutdown
+
 	// netMap is the most recently set full netmap from the controlclient.
 	// It can't be mutated in place once set. Because it can't be mutated in place,
 	// delta updates from the control server don't apply to it. Instead, use
@@ -88,12 +95,24 @@ type nodeBackend struct {
 	nodeByAddr map[netip.Addr]tailcfg.NodeID
 }

-func newNodeBackend() *nodeBackend {
-	cn := &nodeBackend{}
+func newNodeBackend(ctx context.Context) *nodeBackend {
+	ctx, ctxCancel := context.WithCancelCause(ctx)
+	nb := &nodeBackend{
+		ctx:       ctx,
+		ctxCancel: ctxCancel,
+		readyCh:   make(chan struct{}),
+	}
 	// Default filter blocks everything and logs nothing.
 	noneFilter := filter.NewAllowNone(logger.Discard, &netipx.IPSet{})
-	cn.filterAtomic.Store(noneFilter)
-	return cn
+	nb.filterAtomic.Store(noneFilter)
+	return nb
 }

+// Context returns a context that is canceled when the [nodeBackend] shuts down,
+// either because [LocalBackend] is switching to a different [nodeBackend]
+// or is shutting down itself.
+func (nb *nodeBackend) Context() context.Context {
+	return nb.ctx
+}
+
 func (nb *nodeBackend) Self() tailcfg.NodeView {
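
The per-node context gives node-scoped background work a natural cancellation signal. A minimal usage sketch, assuming package-internal access; runNodeScopedTask and its interval are illustrative and not part of this commit:

// runNodeScopedTask runs periodic work tied to one node's lifetime
// (hypothetical helper, shown for illustration only).
func runNodeScopedTask(nb *nodeBackend) {
	go func() {
		ticker := time.NewTicker(time.Minute) // illustrative interval
		defer ticker.Stop()
		for {
			select {
			case <-nb.Context().Done():
				// The node was switched away from, or LocalBackend shut down;
				// context.Cause(nb.Context()) reports the shutdown cause.
				return
			case <-ticker.C:
				// ... periodic per-node work ...
			}
		}
	}()
}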
@@ -475,6 +494,59 @@ func (nb *nodeBackend) exitNodeCanProxyDNS(exitNodeID tailcfg.StableNodeID) (doh
 	return exitNodeCanProxyDNS(nb.netMap, nb.peers, exitNodeID)
 }

+// ready signals that [LocalBackend] has completed the switch to this [nodeBackend],
+// unblocking any pending [nodeBackend.Wait] calls.
+func (nb *nodeBackend) ready() {
+	nb.mu.Lock()
+	defer nb.mu.Unlock()
+	if nb.readyCh != nil {
+		close(nb.readyCh)
+	}
+}
+
+// Wait blocks until [LocalBackend] completes the switch to this [nodeBackend]
+// and calls [nodeBackend.ready]. It returns an error if the provided context
+// is canceled or if the [nodeBackend] shuts down or is already shut down.
+//
+// It must not be called with [LocalBackend]'s internal mutex held, as [LocalBackend]
+// may need to acquire it to complete the switch.
+//
+// TODO(nickkhyl): Relax this restriction once [LocalBackend]'s state machine
+// runs in its own goroutine, or if we decide that waiting for the state machine
+// restart to finish isn't necessary for [LocalBackend] to consider the switch complete.
+// We mostly need this because [LocalBackend.Start] acquires b.mu and methods
+// like [LocalBackend.SwitchProfile] must report any errors it returns.
+// Perhaps we could report those errors asynchronously as [health.Warnable]s?
+func (nb *nodeBackend) Wait(ctx context.Context) error {
+	nb.mu.Lock()
+	readyCh := nb.readyCh
+	nb.mu.Unlock()
+
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-nb.ctx.Done():
+		return context.Cause(nb.ctx)
+	case <-readyCh:
+		return nil
+	}
+}
+
+// shutdown shuts down the [nodeBackend] and cancels its context
+// with the provided cause.
+func (nb *nodeBackend) shutdown(cause error) {
+	nb.shutdownOnce.Do(func() {
+		nb.doShutdown(cause)
+	})
+}
+
+func (nb *nodeBackend) doShutdown(cause error) {
+	nb.mu.Lock()
+	defer nb.mu.Unlock()
+	nb.ctxCancel(cause)
+	nb.readyCh = nil
+}
+
 // dnsConfigForNetmap returns a *dns.Config for the given netmap,
 // prefs, client OS version, and cloud hosting environment.
 //
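
Taken together, ready, Wait, and shutdown behave as follows: Wait blocks until ready fires, and after shutdown it reports the shutdown cause even if ready was called earlier, since readyCh is nil once the backend is shut down. A test-style sketch of those semantics, assuming it sits in package ipnlocal next to the code above (this test is not part of the excerpt):

func TestNodeBackendLifecycle(t *testing.T) {
	nb := newNodeBackend(context.Background())

	// Wait blocks until ready is called.
	errc := make(chan error, 1)
	go func() { errc <- nb.Wait(context.Background()) }()
	nb.ready()
	if err := <-errc; err != nil {
		t.Fatalf("Wait after ready: %v", err)
	}

	// shutdown cancels the node's context with the provided cause...
	cause := errors.New("switched to a different node")
	nb.shutdown(cause)
	if got := context.Cause(nb.Context()); got != cause {
		t.Fatalf("context cause: got %v, want %v", got, cause)
	}

	// ...and later Wait calls report that cause rather than success.
	if err := nb.Wait(context.Background()); err != cause {
		t.Fatalf("Wait after shutdown: got %v, want %v", err, cause)
	}
}

Per the doc comment on Wait, none of these Wait calls may be made while holding LocalBackend's internal mutex, since completing the switch (and hence the ready call) may itself need to acquire it.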