cmd/{containerboot,k8s-operator},kube/kubetypes: unadvertise ingress services on shutdown (#15451)

Ensure no services are advertised as part of shutting down tailscaled. Prefs are only edited if services are currently advertised, and if they're edited we wait for control's ~15s (+ buffer) delay to fail over. Note that editing prefs will trigger a synchronous write to the state Secret, so it may fail to persist state if the ProxyGroup is getting scaled down and therefore has its RBAC deleted at the same time, but that failure doesn't stop prefs being updated within the local backend, doesn't affect connectivity to control, and the state Secret is about to get deleted anyway, so the only negative side effect is a harmless error log during shutdown. Control still learns that the node is no longer advertising the service and triggers the failover. Note that the first version of this used a PreStop lifecycle hook, but that only supports GET methods and we need the shutdown to trigger side effects (updating prefs) so it didn't seem appropriate to expose that functionality on a GET endpoint that's accessible on the k8s network. Updates tailscale/corp#24795 Change-Id: I0a9a4fe7a5395ca76135ceead05cbc3ee32b3d3c Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
2025-12-01 09:32:08 +00:00 · 2025-04-09 10:11:15 +01:00
parent 8e1aa86bdb
commit dd95a83a65
5 changed files with 75 additions and 17 deletions
--- a/cmd/containerboot/healthz.go
+++ b/cmd/containerboot/healthz.go
@@ -47,10 +47,10 @@ func (h *healthz) update(healthy bool) {
 	h.hasAddrs = healthy
 }

-// healthHandlers registers a simple health handler at /healthz.
+// registerHealthHandlers registers a simple health handler at /healthz.
 // A containerized tailscale instance is considered healthy if
 // it has at least one tailnet IP address.
-func healthHandlers(mux *http.ServeMux, podIPv4 string) *healthz {
+func registerHealthHandlers(mux *http.ServeMux, podIPv4 string) *healthz {
 	h := &healthz{podIPv4: podIPv4}
 	mux.Handle("GET /healthz", h)
 	return h
--- a/cmd/containerboot/main.go
+++ b/cmd/containerboot/main.go
@@ -195,18 +195,21 @@ func run() error {
 		return fmt.Errorf("failed to bring up tailscale: %w", err)
 	}
 	killTailscaled := func() {
+		// The default termination grace period for a Pod is 30s. We wait 25s at
+		// most so that we still reserve some of that budget for tailscaled
+		// to receive and react to a SIGTERM before the SIGKILL that k8s
+		// will send at the end of the grace period.
+		ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second)
+		defer cancel()
+
+		if err := ensureServicesNotAdvertised(ctx, client); err != nil {
+			log.Printf("Error ensuring services are not advertised: %v", err)
+		}
+
 		if hasKubeStateStore(cfg) {
 			// Check we're not shutting tailscaled down while it's still writing
 			// state. If we authenticate and fail to write all the state, we'll
 			// never recover automatically.
-			//
-			// The default termination grace period for a Pod is 30s. We wait 25s at
-			// most so that we still reserve some of that budget for tailscaled
-			// to receive and react to a SIGTERM before the SIGKILL that k8s
-			// will send at the end of the grace period.
-			ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second)
-			defer cancel()
-
 			log.Printf("Checking for consistent state")
 			err := kc.waitForConsistentState(ctx)
 			if err != nil {
@@ -226,7 +229,7 @@ func run() error {
 		mux := http.NewServeMux()

 		log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort)
-		healthCheck = healthHandlers(mux, cfg.PodIPv4)
+		healthCheck = registerHealthHandlers(mux, cfg.PodIPv4)

 		close := runHTTPServer(mux, cfg.HealthCheckAddrPort)
 		defer close()
@@ -237,15 +240,16 @@ func run() error {

 		if cfg.localMetricsEnabled() {
 			log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort)
-			metricsHandlers(mux, client, cfg.DebugAddrPort)
+			registerMetricsHandlers(mux, client, cfg.DebugAddrPort)
 		}

 		if cfg.localHealthEnabled() {
 			log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort)
-			healthCheck = healthHandlers(mux, cfg.PodIPv4)
+			healthCheck = registerHealthHandlers(mux, cfg.PodIPv4)
 		}
-		if cfg.EgressProxiesCfgPath != "" {
-			log.Printf("Running preshutdown hook at %s%s", cfg.LocalAddrPort, kubetypes.EgessServicesPreshutdownEP)
+
+		if cfg.egressSvcsTerminateEPEnabled() {
+			log.Printf("Running egress preshutdown hook at %s%s", cfg.LocalAddrPort, kubetypes.EgessServicesPreshutdownEP)
 			ep.registerHandlers(mux)
 		}

--- a/cmd/containerboot/metrics.go
+++ b/cmd/containerboot/metrics.go
@@ -62,13 +62,13 @@ func (m *metrics) handleDebug(w http.ResponseWriter, r *http.Request) {
 	proxy(w, r, debugURL, http.DefaultClient.Do)
 }

-// metricsHandlers registers a simple HTTP metrics handler at /metrics, forwarding
+// registerMetricsHandlers registers a simple HTTP metrics handler at /metrics, forwarding
 // requests to tailscaled's /localapi/v0/usermetrics API.
 //
 // In 1.78.x and 1.80.x, it also proxies debug paths to tailscaled's debug
 // endpoint if configured to ease migration for a breaking change serving user
 // metrics instead of debug metrics on the "metrics" port.
-func metricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) {
+func registerMetricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) {
 	m := &metrics{
 		lc:            lc,
 		debugEndpoint: debugAddrPort,
--- a/cmd/containerboot/serve.go
+++ b/cmd/containerboot/serve.go
@@ -9,6 +9,7 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
+	"fmt"
 	"log"
 	"os"
 	"path/filepath"
@@ -169,3 +170,46 @@ func readServeConfig(path, certDomain string) (*ipn.ServeConfig, error) {
 	}
 	return &sc, nil
 }
+
+func ensureServicesNotAdvertised(ctx context.Context, lc *local.Client) error {
+	prefs, err := lc.GetPrefs(ctx)
+	if err != nil {
+		return fmt.Errorf("error getting prefs: %w", err)
+	}
+	if len(prefs.AdvertiseServices) == 0 {
+		return nil
+	}
+
+	log.Printf("serve proxy: unadvertising services: %v", prefs.AdvertiseServices)
+	if _, err := lc.EditPrefs(ctx, &ipn.MaskedPrefs{
+		AdvertiseServicesSet: true,
+		Prefs: ipn.Prefs{
+			AdvertiseServices: nil,
+		},
+	}); err != nil {
+		// EditPrefs only returns an error if it fails to _set_ its local prefs.
+		// If it fails to _persist_ the prefs in state, we don't get an error
+		// and we continue waiting below, as control will failover as usual.
+		return fmt.Errorf("error setting prefs AdvertiseServices: %w", err)
+	}
+
+	// Services use the same (failover XOR regional routing) mechanism that
+	// HA subnet routers use. Unfortunately we don't yet get a reliable signal
+	// from control that it's responded to our unadvertisement, so the best we
+	// can do is wait for 20 seconds, where 15s is the approximate maximum time
+	// it should take for control to choose a new primary, and 5s is for buffer.
+	//
+	// Note: There is no guarantee that clients have been _informed_ of the new
+	// primary no matter how long we wait. We would need a mechanism to await
+	// netmap updates for peers to know for sure.
+	//
+	// See https://tailscale.com/kb/1115/high-availability for more details.
+	// TODO(tomhjp): Wait for a netmap update instead of sleeping when control
+	// supports that.
+	select {
+	case <-ctx.Done():
+		return nil
+	case <-time.After(20 * time.Second):
+		return nil
+	}
+}
--- a/cmd/k8s-operator/proxygroup_specs.go
+++ b/cmd/k8s-operator/proxygroup_specs.go
@@ -197,6 +197,16 @@ func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode string
 	// This mechanism currently (2025-01-26) rely on the local health check being accessible on the Pod's
 	// IP, so they are not supported for ProxyGroups where users have configured TS_LOCAL_ADDR_PORT to a custom
 	// value.
+	//
+	// NB: For _Ingress_ ProxyGroups, we run shutdown logic within containerboot
+	// in reaction to a SIGTERM signal instead of using a pre-stop hook. This is
+	// because Ingress pods need to unadvertise services, and it's preferable to
+	// avoid triggering those side-effects from a GET request that would be
+	// accessible to the whole cluster network (in the absence of NetworkPolicy
+	// rules).
+	//
+	// TODO(tomhjp): add a readiness probe or gate to Ingress Pods. There is a
+	// small window where the Pod is marked ready but routing can still fail.
 	if pg.Spec.Type == tsapi.ProxyGroupTypeEgress && !hasLocalAddrPortSet(proxyClass) {
 		c.Lifecycle = &corev1.Lifecycle{
 			PreStop: &corev1.LifecycleHandler{