client/tailscale,ipn/{ipnlocal,localapi}: add a pre-shutdown localAPI endpoint that terminates control connections.

Adds a /lameduck local API endpoint that just shuts down control client.
This can be run before shutting down an HA subnet router/app connector replica - it will ensure
that all connection to control are dropped and control thus considers this node inactive and tells
peers to switch over to another replica. Meanwhile the existing connections keep working (assuming
that the replica is given some graceful shutdown period).

Updates tailscale/tailscale#14020

Signed-off-by: Irbe Krumina <irbe@tailscale.com>
This commit is contained in:
Irbe Krumina 2024-11-06 06:52:05 +00:00
parent 49de23cf1b
commit 7de684e71a
3 changed files with 41 additions and 0 deletions

View File

@ -1327,6 +1327,17 @@ func (lc *LocalClient) SetServeConfig(ctx context.Context, config *ipn.ServeConf
return nil
}
// LameDuck shuts down all connections to control, thus making control consider this node inactive. This can be run on
// HA subnet router or app connector replicas before shutting them down to ensure peers get told to switch over to
// another replica whilst there is still some grace period for the existing connections to terminate.
func (lc *LocalClient) LameDuck(ctx context.Context) error {
_, _, err := lc.sendWithHeaders(ctx, "POST", "/localapi/v0/lameduck", 200, nil, nil)
if err != nil {
return fmt.Errorf("error enabling lameduck mode: %w", err)
}
return nil
}
// NetworkLockDisable shuts down network-lock across the tailnet.
func (lc *LocalClient) NetworkLockDisable(ctx context.Context, secret []byte) error {
if _, err := lc.send(ctx, "POST", "/localapi/v0/tka/disable", 200, bytes.NewReader(secret)); err != nil {

View File

@ -780,6 +780,19 @@ func (b *LocalBackend) pauseOrResumeControlClientLocked() {
b.cc.SetPaused((b.state == ipn.Stopped && b.netMap != nil) || (!networkUp && !testenv.InTest() && !assumeNetworkUpdateForTest()))
}
// LameDuck shuts down control client. This can be run before node shutdown to force control to consider this ndoe
// inactive. This can be used to ensure that nodes that are HA subnet router or app connector replicas are shutting
// down, clients switch over to other replicas whilst the existing connections are kept alive for some period of time.
func (b *LocalBackend) LameDuck() {
b.mu.Lock()
defer b.mu.Unlock()
cc := b.resetControlClientLocked()
if cc == nil {
return
}
cc.Shutdown()
}
// captivePortalDetectionInterval is the duration to wait in an unhealthy state with connectivity broken
// before running captive portal detection.
const captivePortalDetectionInterval = 2 * time.Second

View File

@ -108,6 +108,7 @@
"goroutines": (*Handler).serveGoroutines,
"handle-push-message": (*Handler).serveHandlePushMessage,
"id-token": (*Handler).serveIDToken,
"lameduck": (*Handler).lameDuck,
"login-interactive": (*Handler).serveLoginInteractive,
"logout": (*Handler).serveLogout,
"logtap": (*Handler).serveLogTap,
@ -952,6 +953,22 @@ func (h *Handler) servePprof(w http.ResponseWriter, r *http.Request) {
servePprofFunc(w, r)
}
// lameDuck is the handler for local API /lameduck endpoint that shuts down control client, so that node no longer
// communicates with control. Doing this makes control consider this node inactive. This can be used before shutting
// down a replica of HA subnet router or app connector deployments to ensure that control tells the peers to switch
// over to another replica whilst still maintaining th existing peer connections.
func (h *Handler) lameDuck(w http.ResponseWriter, r *http.Request) {
if !h.PermitWrite {
http.Error(w, "access denied", http.StatusForbidden)
return
}
if r.Method != httpm.POST {
http.Error(w, "use POST", http.StatusMethodNotAllowed)
return
}
h.b.LameDuck()
}
func (h *Handler) reloadConfig(w http.ResponseWriter, r *http.Request) {
if !h.PermitWrite {
http.Error(w, "access denied", http.StatusForbidden)