From 2c8859c2e725af2de59203c0b2d39b96f135cb60 Mon Sep 17 00:00:00 2001 From: Irbe Krumina Date: Thu, 7 Nov 2024 19:27:53 +0000 Subject: [PATCH] client/tailscale,ipn/{ipnlocal,localapi}: add a pre-shutdown localAPI endpoint that terminates control connections. (#14028) Adds a /disconnect-control local API endpoint that just shuts down control client. This can be run before shutting down an HA subnet router/app connector replica - it will ensure that all connections to control are dropped and control thus considers this node inactive and tells peers to switch over to another replica. Meanwhile the existing connections keep working (assuming that the replica is given some graceful shutdown period). Updates tailscale/tailscale#14020 Signed-off-by: Irbe Krumina --- client/tailscale/localclient.go | 11 +++++++++++ ipn/ipnlocal/local.go | 13 +++++++++++++ ipn/localapi/localapi.go | 17 +++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/client/tailscale/localclient.go b/client/tailscale/localclient.go index 9c2bcc467..5eb668176 100644 --- a/client/tailscale/localclient.go +++ b/client/tailscale/localclient.go @@ -1327,6 +1327,17 @@ func (lc *LocalClient) SetServeConfig(ctx context.Context, config *ipn.ServeConf return nil } +// DisconnectControl shuts down all connections to control, thus making control consider this node inactive. This can be +// run on HA subnet router or app connector replicas before shutting them down to ensure peers get told to switch over +// to another replica whilst there is still some grace period for the existing connections to terminate. +func (lc *LocalClient) DisconnectControl(ctx context.Context) error { + _, _, err := lc.sendWithHeaders(ctx, "POST", "/localapi/v0/disconnect-control", 200, nil, nil) + if err != nil { + return fmt.Errorf("error disconnecting control: %w", err) + } + return nil +} + // NetworkLockDisable shuts down network-lock across the tailnet. 
func (lc *LocalClient) NetworkLockDisable(ctx context.Context, secret []byte) error { if _, err := lc.send(ctx, "POST", "/localapi/v0/tka/disable", 200, bytes.NewReader(secret)); err != nil { diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index edd56f7c4..337fa3d2b 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -800,6 +800,19 @@ func (b *LocalBackend) pauseOrResumeControlClientLocked() { b.cc.SetPaused((b.state == ipn.Stopped && b.netMap != nil) || (!networkUp && !testenv.InTest() && !assumeNetworkUpdateForTest())) } +// DisconnectControl shuts down control client. This can be run before node shutdown to force control to consider this node +// inactive. This can be used to ensure that when nodes that are HA subnet router or app connector replicas are shutting +// down, clients switch over to other replicas whilst the existing connections are kept alive for some period of time. +func (b *LocalBackend) DisconnectControl() { + b.mu.Lock() + defer b.mu.Unlock() + cc := b.resetControlClientLocked() + if cc == nil { + return + } + cc.Shutdown() +} + // captivePortalDetectionInterval is the duration to wait in an unhealthy state with connectivity broken // before running captive portal detection. 
const captivePortalDetectionInterval = 2 * time.Second diff --git a/ipn/localapi/localapi.go b/ipn/localapi/localapi.go index 0d41725d8..dc8c08975 100644 --- a/ipn/localapi/localapi.go +++ b/ipn/localapi/localapi.go @@ -100,6 +100,7 @@ "derpmap": (*Handler).serveDERPMap, "dev-set-state-store": (*Handler).serveDevSetStateStore, "dial": (*Handler).serveDial, + "disconnect-control": (*Handler).disconnectControl, "dns-osconfig": (*Handler).serveDNSOSConfig, "dns-query": (*Handler).serveDNSQuery, "drive/fileserver-address": (*Handler).serveDriveServerAddr, @@ -952,6 +953,22 @@ func (h *Handler) servePprof(w http.ResponseWriter, r *http.Request) { servePprofFunc(w, r) } +// disconnectControl is the handler for local API /disconnect-control endpoint that shuts down control client, so that +// node no longer communicates with control. Doing this makes control consider this node inactive. This can be used +// before shutting down a replica of HA subnet router or app connector deployments to ensure that control tells the +// peers to switch over to another replica whilst still maintaining the existing peer connections. +func (h *Handler) disconnectControl(w http.ResponseWriter, r *http.Request) { + if !h.PermitWrite { + http.Error(w, "access denied", http.StatusForbidden) + return + } + if r.Method != httpm.POST { + http.Error(w, "use POST", http.StatusMethodNotAllowed) + return + } + h.b.DisconnectControl() +} + func (h *Handler) reloadConfig(w http.ResponseWriter, r *http.Request) { if !h.PermitWrite { http.Error(w, "access denied", http.StatusForbidden)