cmd/{k8s-operator,containerboot},kube: ensure egress ProxyGroup proxies don't terminate while cluster traffic is still routed to them (#14436)
cmd/{containerboot,k8s-operator},kube: add preshutdown hook for egress PG proxies
This change is part of work towards minimizing downtime during update
rollouts of egress ProxyGroup replicas.
This change:
- updates the containerboot health check logic to return Pod IP in headers,
if set
- always runs the health check for egress PG proxies
- updates ClusterIP Services created for PG egress endpoints to include
the health check endpoint
- implements a preshutdown endpoint in proxies. The preshutdown endpoint
logic waits until, for all currently configured egress services, the ClusterIP
Service health check endpoint is no longer served by the shutting-down Pod
(determined by looking at the new Pod IP header).
- ensures that kubelet is configured to call the preshutdown endpoint
This reduces the possibility that, as replicas are terminated during an update,
a replica to which cluster traffic is still being routed via the ClusterIP Service
gets terminated because kube-proxy has not yet updated its routing rules.
This is not a perfect check: in practice it only verifies that the kube-proxy
on the node on which the proxy runs has updated its rules. However, overall
this might be good enough.
The preshutdown logic is disabled if users have configured a custom health check
port via the TS_LOCAL_ADDR_PORT env var. This change emits a warning if so, and in
future setting that env var for operator proxies might be disallowed (as users
shouldn't need to configure this for a Pod directly).
This is backwards compatible with earlier proxy versions.
Updates tailscale/tailscale#14326
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
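To make the last bullet above concrete: the operator-side wiring (not included in the diff below) points kubelet's container lifecycle preStop hook at the proxy's preshutdown endpoint, so kubelet blocks Pod termination until that endpoint returns. The following is only a rough sketch of that idea in Go, assuming a recent k8s.io/api where the lifecycle handler type is LifecycleHandler; the helper name, port, and literal path are illustrative stand-ins (the real code uses the kubetypes.EgessServicesPreshutdownEP constant and the proxy's local-addr port).

```go
package main

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// withPreshutdownHook is a hypothetical helper showing how a preStop hook
// could be attached to the proxy container spec.
func withPreshutdownHook(c corev1.Container) corev1.Container {
	c.Lifecycle = &corev1.Lifecycle{
		PreStop: &corev1.LifecycleHandler{
			HTTPGet: &corev1.HTTPGetAction{
				// Assumed values: the path matches the endpoint name described
				// in the proxy code below; 9002 stands in for the local addr port.
				Path: "/internal-egress-services-preshutdown",
				Port: intstr.FromInt(9002),
			},
		},
	}
	return c
}
```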
@@ -6,9 +6,12 @@
package main

import (
"fmt"
"log"
"net/http"
"sync"

"tailscale.com/kube/kubetypes"
)

// healthz is a simple health check server, if enabled it returns 200 OK if
@@ -17,6 +20,7 @@ import (
type healthz struct {
sync.Mutex
hasAddrs bool
podIPv4 string
}

func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
@@ -24,7 +28,10 @@ func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
defer h.Unlock()

if h.hasAddrs {
w.Write([]byte("ok"))
w.Header().Add(kubetypes.PodIPv4Header, h.podIPv4)
if _, err := w.Write([]byte("ok")); err != nil {
http.Error(w, fmt.Sprintf("error writing status: %v", err), http.StatusInternalServerError)
}
} else {
http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable)
}
@@ -43,8 +50,8 @@ func (h *healthz) update(healthy bool) {
// healthHandlers registers a simple health handler at /healthz.
// A containerized tailscale instance is considered healthy if
// it has at least one tailnet IP address.
func healthHandlers(mux *http.ServeMux) *healthz {
h := &healthz{}
func healthHandlers(mux *http.ServeMux, podIPv4 string) *healthz {
h := &healthz{podIPv4: podIPv4}
mux.Handle("GET /healthz", h)
return h
}
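With the change above, the /healthz response now carries the serving Pod's IPv4 address in a Pod-IPv4 response header (kubetypes.PodIPv4Header). When the endpoint is exposed behind a ClusterIP Service, a caller can use that header to tell which backend actually answered. A minimal client-side sketch, with an assumed Service URL and the header name written out as a literal:

```go
package main

import (
	"fmt"
	"net/http"
)

// servedBy returns the value of the Pod-IPv4 header from a /healthz response,
// i.e. the IP of the Pod that served the request.
func servedBy(url string) (string, error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	// Close the connection so a follow-up request may be load balanced to a
	// different backend rather than reusing the same TCP connection.
	req.Close = true
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	return resp.Header.Get("Pod-IPv4"), nil
}

func main() {
	// Hypothetical ClusterIP Service fronting the proxy replicas' health endpoints.
	ip, err := servedBy("http://example-egress-hc.example-ns.svc.cluster.local:9002/healthz")
	if err != nil {
		fmt.Println("health endpoint unreachable:", err)
		return
	}
	fmt.Println("request served by pod:", ip)
}
```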
@@ -191,17 +191,18 @@ func main() {
defer killTailscaled()

var healthCheck *healthz
ep := &egressProxy{}
if cfg.HealthCheckAddrPort != "" {
mux := http.NewServeMux()

log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort)
healthCheck = healthHandlers(mux)
healthCheck = healthHandlers(mux, cfg.PodIPv4)

close := runHTTPServer(mux, cfg.HealthCheckAddrPort)
defer close()
}

if cfg.localMetricsEnabled() || cfg.localHealthEnabled() {
if cfg.localMetricsEnabled() || cfg.localHealthEnabled() || cfg.egressSvcsTerminateEPEnabled() {
mux := http.NewServeMux()

if cfg.localMetricsEnabled() {
@@ -211,7 +212,11 @@ func main() {

if cfg.localHealthEnabled() {
log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort)
healthCheck = healthHandlers(mux)
healthCheck = healthHandlers(mux, cfg.PodIPv4)
}
if cfg.EgressProxiesCfgPath != "" {
log.Printf("Running preshutdown hook at %s%s", cfg.LocalAddrPort, kubetypes.EgessServicesPreshutdownEP)
ep.registerHandlers(mux)
}

close := runHTTPServer(mux, cfg.LocalAddrPort)
@@ -639,20 +644,21 @@ runLoop:
// will then continuously monitor the config file and netmap updates and
// reconfigure the firewall rules as needed. If any of its operations fail, it
// will crash this node.
if cfg.EgressSvcsCfgPath != "" {
log.Printf("configuring egress proxy using configuration file at %s", cfg.EgressSvcsCfgPath)
if cfg.EgressProxiesCfgPath != "" {
log.Printf("configuring egress proxy using configuration file at %s", cfg.EgressProxiesCfgPath)
egressSvcsNotify = make(chan ipn.Notify)
ep := egressProxy{
cfgPath: cfg.EgressSvcsCfgPath,
opts := egressProxyRunOpts{
cfgPath: cfg.EgressProxiesCfgPath,
nfr: nfr,
kc: kc,
tsClient: client,
stateSecret: cfg.KubeSecret,
netmapChan: egressSvcsNotify,
podIPv4: cfg.PodIPv4,
tailnetAddrs: addrs,
}
go func() {
if err := ep.run(ctx, n); err != nil {
if err := ep.run(ctx, n, opts); err != nil {
egressSvcsErrorChan <- err
}
}()
@@ -32,6 +32,8 @@ import (
"golang.org/x/sys/unix"
"tailscale.com/ipn"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/kubeclient"
"tailscale.com/kube/kubetypes"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/types/netmap"
@@ -54,20 +56,9 @@ func TestContainerBoot(t *testing.T) {
defer kube.Close()

tailscaledConf := &ipn.ConfigVAlpha{AuthKey: ptr.To("foo"), Version: "alpha0"}
tailscaledConfBytes, err := json.Marshal(tailscaledConf)
if err != nil {
t.Fatalf("error unmarshaling tailscaled config: %v", err)
}
serveConf := ipn.ServeConfig{TCP: map[uint16]*ipn.TCPPortHandler{80: {HTTP: true}}}
serveConfBytes, err := json.Marshal(serveConf)
if err != nil {
t.Fatalf("error unmarshaling serve config: %v", err)
}
egressSvcsCfg := egressservices.Configs{"foo": {TailnetTarget: egressservices.TailnetTarget{FQDN: "foo.tailnetxyx.ts.net"}}}
egressSvcsCfgBytes, err := json.Marshal(egressSvcsCfg)
if err != nil {
t.Fatalf("error unmarshaling egress services config: %v", err)
}
egressCfg := egressSvcConfig("foo", "foo.tailnetxyz.ts.net")
egressStatus := egressSvcStatus("foo", "foo.tailnetxyz.ts.net")

dirs := []string{
"var/lib",
@@ -84,16 +75,17 @@ func TestContainerBoot(t *testing.T) {
}
}
files := map[string][]byte{
"usr/bin/tailscaled": fakeTailscaled,
"usr/bin/tailscale": fakeTailscale,
"usr/bin/iptables": fakeTailscale,
"usr/bin/ip6tables": fakeTailscale,
"dev/net/tun": []byte(""),
"proc/sys/net/ipv4/ip_forward": []byte("0"),
"proc/sys/net/ipv6/conf/all/forwarding": []byte("0"),
"etc/tailscaled/cap-95.hujson": tailscaledConfBytes,
"etc/tailscaled/serve-config.json": serveConfBytes,
"etc/tailscaled/egress-services-config.json": egressSvcsCfgBytes,
"usr/bin/tailscaled": fakeTailscaled,
"usr/bin/tailscale": fakeTailscale,
"usr/bin/iptables": fakeTailscale,
"usr/bin/ip6tables": fakeTailscale,
"dev/net/tun": []byte(""),
"proc/sys/net/ipv4/ip_forward": []byte("0"),
"proc/sys/net/ipv6/conf/all/forwarding": []byte("0"),
"etc/tailscaled/cap-95.hujson": mustJSON(t, tailscaledConf),
"etc/tailscaled/serve-config.json": mustJSON(t, serveConf),
filepath.Join("etc/tailscaled/", egressservices.KeyEgressServices): mustJSON(t, egressCfg),
filepath.Join("etc/tailscaled/", egressservices.KeyHEPPings): []byte("4"),
}
resetFiles := func() {
for path, content := range files {
@@ -132,6 +124,9 @@ func TestContainerBoot(t *testing.T) {
healthURL := func(port int) string {
return fmt.Sprintf("http://127.0.0.1:%d/healthz", port)
}
egressSvcTerminateURL := func(port int) string {
return fmt.Sprintf("http://127.0.0.1:%d%s", port, kubetypes.EgessServicesPreshutdownEP)
}

capver := fmt.Sprintf("%d", tailcfg.CurrentCapabilityVersion)

@@ -896,9 +891,10 @@ func TestContainerBoot(t *testing.T) {
{
Name: "egress_svcs_config_kube",
Env: map[string]string{
"KUBERNETES_SERVICE_HOST": kube.Host,
"KUBERNETES_SERVICE_PORT_HTTPS": kube.Port,
"TS_EGRESS_SERVICES_CONFIG_PATH": filepath.Join(d, "etc/tailscaled/egress-services-config.json"),
"KUBERNETES_SERVICE_HOST": kube.Host,
"KUBERNETES_SERVICE_PORT_HTTPS": kube.Port,
"TS_EGRESS_PROXIES_CONFIG_PATH": filepath.Join(d, "etc/tailscaled"),
"TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
},
KubeSecret: map[string]string{
"authkey": "tskey-key",
@@ -912,28 +908,35 @@ func TestContainerBoot(t *testing.T) {
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
},
EndpointStatuses: map[string]int{
egressSvcTerminateURL(localAddrPort): 200,
},
},
{
Notify: runningNotify,
WantKubeSecret: map[string]string{
"egress-services": mustBase64(t, egressStatus),
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
"tailscale_capver": capver,
},
EndpointStatuses: map[string]int{
egressSvcTerminateURL(localAddrPort): 200,
},
},
},
},
{
Name: "egress_svcs_config_no_kube",
Env: map[string]string{
"TS_EGRESS_SERVICES_CONFIG_PATH": filepath.Join(d, "etc/tailscaled/egress-services-config.json"),
"TS_AUTHKEY": "tskey-key",
"TS_EGRESS_PROXIES_CONFIG_PATH": filepath.Join(d, "etc/tailscaled"),
"TS_AUTHKEY": "tskey-key",
},
Phases: []phase{
{
WantFatalLog: "TS_EGRESS_SERVICES_CONFIG_PATH is only supported for Tailscale running on Kubernetes",
WantFatalLog: "TS_EGRESS_PROXIES_CONFIG_PATH is only supported for Tailscale running on Kubernetes",
},
},
},
@@ -1394,13 +1397,31 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
panic(fmt.Sprintf("json decode failed: %v. Body:\n\n%s", err, string(bs)))
}
for _, op := range req {
if op.Op != "remove" {
if op.Op == "remove" {
if !strings.HasPrefix(op.Path, "/data/") {
panic(fmt.Sprintf("unsupported json-patch path %q", op.Path))
}
delete(k.secret, strings.TrimPrefix(op.Path, "/data/"))
} else if op.Op == "replace" {
path, ok := strings.CutPrefix(op.Path, "/data/")
if !ok {
panic(fmt.Sprintf("unsupported json-patch path %q", op.Path))
}
req := make([]kubeclient.JSONPatch, 0)
if err := json.Unmarshal(bs, &req); err != nil {
panic(fmt.Sprintf("json decode failed: %v. Body:\n\n%s", err, string(bs)))
}

for _, patch := range req {
val, ok := patch.Value.(string)
if !ok {
panic(fmt.Sprintf("unsupported json patch value %v: cannot be converted to string", patch.Value))
}
k.secret[path] = val
}
} else {
panic(fmt.Sprintf("unsupported json-patch op %q", op.Op))
}
if !strings.HasPrefix(op.Path, "/data/") {
panic(fmt.Sprintf("unsupported json-patch path %q", op.Path))
}
delete(k.secret, strings.TrimPrefix(op.Path, "/data/"))
}
case "application/strategic-merge-patch+json":
req := struct {
@@ -1419,3 +1440,41 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
panic(fmt.Sprintf("unhandled HTTP method %q", r.Method))
}
}

func mustBase64(t *testing.T, v any) string {
b := mustJSON(t, v)
s := base64.StdEncoding.WithPadding('=').EncodeToString(b)
return s
}

func mustJSON(t *testing.T, v any) []byte {
b, err := json.Marshal(v)
if err != nil {
t.Fatalf("error converting %v to json: %v", v, err)
}
return b
}

// egress services status given one named tailnet target specified by FQDN. As written by the proxy to its state Secret.
func egressSvcStatus(name, fqdn string) egressservices.Status {
return egressservices.Status{
Services: map[string]*egressservices.ServiceStatus{
name: {
TailnetTarget: egressservices.TailnetTarget{
FQDN: fqdn,
},
},
},
}
}

// egress config given one named tailnet target specified by FQDN.
func egressSvcConfig(name, fqdn string) egressservices.Configs {
return egressservices.Configs{
name: egressservices.Config{
TailnetTarget: egressservices.TailnetTarget{
FQDN: fqdn,
},
},
}
}
@@ -11,18 +11,24 @@ import (
"errors"
"fmt"
"log"
"net/http"
"net/netip"
"os"
"path/filepath"
"reflect"
"strconv"
"strings"
"time"

"github.com/fsnotify/fsnotify"
"tailscale.com/client/tailscale"
"tailscale.com/ipn"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/kubeclient"
"tailscale.com/kube/kubetypes"
"tailscale.com/syncs"
"tailscale.com/tailcfg"
"tailscale.com/util/httpm"
"tailscale.com/util/linuxfw"
"tailscale.com/util/mak"
)
@@ -37,13 +43,15 @@ const tailscaleTunInterface = "tailscale0"
// egressProxy knows how to configure firewall rules to route cluster traffic to
// one or more tailnet services.
type egressProxy struct {
cfgPath string // path to egress service config file
cfgPath string // path to a directory with egress services config files

nfr linuxfw.NetfilterRunner // never nil

kc kubeclient.Client // never nil
stateSecret string // name of the kube state Secret

tsClient *tailscale.LocalClient // never nil

netmapChan chan ipn.Notify // chan to receive netmap updates on

podIPv4 string // never empty string, currently only IPv4 is supported
@@ -55,15 +63,29 @@ type egressProxy struct {
// memory at all.
targetFQDNs map[string][]netip.Prefix

// used to configure firewall rules.
tailnetAddrs []netip.Prefix
tailnetAddrs []netip.Prefix // tailnet IPs of this tailnet device

// shortSleep is the backoff sleep between healthcheck endpoint calls - can be overridden in tests.
shortSleep time.Duration
// longSleep is the time to sleep after the routing rules are updated to increase the chance that kube
// proxies on all nodes have updated their routing configuration. It can be configured to 0 in
// tests.
longSleep time.Duration
// client is a client that can send HTTP requests.
client httpClient
}

// httpClient is a client that can send HTTP requests and can be mocked in tests.
type httpClient interface {
Do(*http.Request) (*http.Response, error)
}

// run configures egress proxy firewall rules and ensures that the firewall rules are reconfigured when:
// - the mounted egress config has changed
// - the proxy's tailnet IP addresses have changed
// - tailnet IPs have changed for any backend targets specified by tailnet FQDN
func (ep *egressProxy) run(ctx context.Context, n ipn.Notify) error {
func (ep *egressProxy) run(ctx context.Context, n ipn.Notify, opts egressProxyRunOpts) error {
ep.configure(opts)
var tickChan <-chan time.Time
var eventChan <-chan fsnotify.Event
// TODO (irbekrm): take a look if this can be pulled into a single func
@@ -75,7 +97,7 @@ func (ep *egressProxy) run(ctx context.Context, n ipn.Notify) error {
tickChan = ticker.C
} else {
defer w.Close()
if err := w.Add(filepath.Dir(ep.cfgPath)); err != nil {
if err := w.Add(ep.cfgPath); err != nil {
return fmt.Errorf("failed to add fsnotify watch: %w", err)
}
eventChan = w.Events
@@ -85,28 +107,52 @@ func (ep *egressProxy) run(ctx context.Context, n ipn.Notify) error {
return err
}
for {
var err error
select {
case <-ctx.Done():
return nil
case <-tickChan:
err = ep.sync(ctx, n)
log.Printf("periodic sync, ensuring firewall config is up to date...")
case <-eventChan:
log.Printf("config file change detected, ensuring firewall config is up to date...")
err = ep.sync(ctx, n)
case n = <-ep.netmapChan:
shouldResync := ep.shouldResync(n)
if shouldResync {
log.Printf("netmap change detected, ensuring firewall config is up to date...")
err = ep.sync(ctx, n)
if !shouldResync {
continue
}
log.Printf("netmap change detected, ensuring firewall config is up to date...")
}
if err != nil {
if err := ep.sync(ctx, n); err != nil {
return fmt.Errorf("error syncing egress service config: %w", err)
}
}
}

type egressProxyRunOpts struct {
cfgPath string
nfr linuxfw.NetfilterRunner
kc kubeclient.Client
tsClient *tailscale.LocalClient
stateSecret string
netmapChan chan ipn.Notify
podIPv4 string
tailnetAddrs []netip.Prefix
}

// applyOpts configures egress proxy using the provided options.
func (ep *egressProxy) configure(opts egressProxyRunOpts) {
ep.cfgPath = opts.cfgPath
ep.nfr = opts.nfr
ep.kc = opts.kc
ep.tsClient = opts.tsClient
ep.stateSecret = opts.stateSecret
ep.netmapChan = opts.netmapChan
ep.podIPv4 = opts.podIPv4
ep.tailnetAddrs = opts.tailnetAddrs
ep.client = &http.Client{} // default HTTP client
ep.shortSleep = time.Second
ep.longSleep = time.Second * 10
}

// sync triggers an egress proxy config resync. The resync calculates the diff between config and status to determine if
// any firewall rules need to be updated. Currently using status in state Secret as a reference for what is the current
// firewall configuration is good enough because - the status is keyed by the Pod IP - we crash the Pod on errors such
@@ -327,7 +373,8 @@ func (ep *egressProxy) deleteUnnecessaryServices(cfgs *egressservices.Configs, s

// getConfigs gets the mounted egress service configuration.
func (ep *egressProxy) getConfigs() (*egressservices.Configs, error) {
j, err := os.ReadFile(ep.cfgPath)
svcsCfg := filepath.Join(ep.cfgPath, egressservices.KeyEgressServices)
j, err := os.ReadFile(svcsCfg)
if os.IsNotExist(err) {
return nil, nil
}
@@ -569,3 +616,142 @@ func servicesStatusIsEqual(st, st1 *egressservices.Status) bool {
st1.PodIPv4 = ""
return reflect.DeepEqual(*st, *st1)
}

// registerHandlers adds a new handler to the provided ServeMux that can be called as a Kubernetes prestop hook to
// delay shutdown till it's safe to do so.
func (ep *egressProxy) registerHandlers(mux *http.ServeMux) {
mux.Handle(fmt.Sprintf("GET %s", kubetypes.EgessServicesPreshutdownEP), ep)
}

// ServeHTTP serves /internal-egress-services-preshutdown endpoint, when it receives a request, it periodically polls
// the configured health check endpoint for each egress service till it the health check endpoint no longer hits this
// proxy Pod. It uses the Pod-IPv4 header to verify if health check response is received from this Pod.
func (ep *egressProxy) ServeHTTP(w http.ResponseWriter, r *http.Request) {
cfgs, err := ep.getConfigs()
if err != nil {
http.Error(w, fmt.Sprintf("error retrieving egress services configs: %v", err), http.StatusInternalServerError)
return
}
if cfgs == nil {
if _, err := w.Write([]byte("safe to terminate")); err != nil {
http.Error(w, fmt.Sprintf("error writing termination status: %v", err), http.StatusInternalServerError)
return
}
}
hp, err := ep.getHEPPings()
if err != nil {
http.Error(w, fmt.Sprintf("error determining the number of times health check endpoint should be pinged: %v", err), http.StatusInternalServerError)
return
}
ep.waitTillSafeToShutdown(r.Context(), cfgs, hp)
}

// waitTillSafeToShutdown looks up all egress targets configured to be proxied via this instance and, for each target
// whose configuration includes a healthcheck endpoint, pings the endpoint till none of the responses
// are returned by this instance or till the HTTP request times out. In practice, the endpoint will be a Kubernetes Service for whom one of the backends
// would normally be this Pod. When this Pod is being deleted, the operator should have removed it from the Service
// backends and eventually kube proxy routing rules should be updated to no longer route traffic for the Service to this
// Pod.
func (ep *egressProxy) waitTillSafeToShutdown(ctx context.Context, cfgs *egressservices.Configs, hp int) {
if cfgs == nil || len(*cfgs) == 0 { // avoid sleeping if no services are configured
return
}
log.Printf("Ensuring that cluster traffic for egress targets is no longer routed via this Pod...")
wg := syncs.WaitGroup{}

for s, cfg := range *cfgs {
hep := cfg.HealthCheckEndpoint
if hep == "" {
log.Printf("Tailnet target %q does not have a cluster healthcheck specified, unable to verify if cluster traffic for the target is still routed via this Pod", s)
continue
}
svc := s
wg.Go(func() {
log.Printf("Ensuring that cluster traffic is no longer routed to %q via this Pod...", svc)
for {
if ctx.Err() != nil { // kubelet's HTTP request timeout
log.Printf("Cluster traffic for %s did not stop being routed to this Pod.", svc)
return
}
found, err := lookupPodRoute(ctx, hep, ep.podIPv4, hp, ep.client)
if err != nil {
log.Printf("unable to reach endpoint %q, assuming the routing rules for this Pod have been deleted: %v", hep, err)
break
}
if !found {
log.Printf("service %q is no longer routed through this Pod", svc)
break
}
log.Printf("service %q is still routed through this Pod, waiting...", svc)
time.Sleep(ep.shortSleep)
}
})
}
wg.Wait()
// The check above really only checked that the routing rules are updated on this node. Sleep for a bit to
// ensure that the routing rules are updated on other nodes. TODO(irbekrm): this may or may not be good enough.
// If it's not good enough, we'd probably want to do something more complex, where the proxies check each other.
log.Printf("Sleeping for %s before shutdown to ensure that kube proxies on all nodes have updated routing configuration", ep.longSleep)
time.Sleep(ep.longSleep)
}

// lookupPodRoute calls the healthcheck endpoint repeat times and returns true if the endpoint returns with the podIP
// header at least once.
func lookupPodRoute(ctx context.Context, hep, podIP string, repeat int, client httpClient) (bool, error) {
for range repeat {
f, err := lookup(ctx, hep, podIP, client)
if err != nil {
return false, err
}
if f {
return true, nil
}
}
return false, nil
}

// lookup calls the healthcheck endpoint and returns true if the response contains the podIP header.
func lookup(ctx context.Context, hep, podIP string, client httpClient) (bool, error) {
req, err := http.NewRequestWithContext(ctx, httpm.GET, hep, nil)
if err != nil {
return false, fmt.Errorf("error creating new HTTP request: %v", err)
}

// Close the TCP connection to ensure that the next request is routed to a different backend.
req.Close = true

resp, err := client.Do(req)
if err != nil {
log.Printf("Endpoint %q can not be reached: %v, likely because there are no (more) healthy backends", hep, err)
return true, nil
}
defer resp.Body.Close()
gotIP := resp.Header.Get(kubetypes.PodIPv4Header)
return strings.EqualFold(podIP, gotIP), nil
}

// getHEPPings gets the number of pings that should be sent to a health check endpoint to ensure that each configured
// backend is hit. This assumes that a health check endpoint is a Kubernetes Service and traffic to backend Pods is
// round robin load balanced.
func (ep *egressProxy) getHEPPings() (int, error) {
hepPingsPath := filepath.Join(ep.cfgPath, egressservices.KeyHEPPings)
j, err := os.ReadFile(hepPingsPath)
if os.IsNotExist(err) {
return 0, nil
}
if err != nil {
return -1, err
}
if len(j) == 0 || string(j) == "" {
return 0, nil
}
hp, err := strconv.Atoi(string(j))
if err != nil {
return -1, fmt.Errorf("error parsing hep pings as int: %v", err)
}
if hp < 0 {
log.Printf("[unexpected] hep pings is negative: %d", hp)
return 0, nil
}
return hp, nil
}
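A note on the mounted configuration that the code above now expects: TS_EGRESS_PROXIES_CONFIG_PATH points at a directory rather than a single file. The directory holds a file named by egressservices.KeyEgressServices containing the egress services config JSON (each service now optionally carrying a HealthCheckEndpoint), plus a file named by egressservices.KeyHEPPings with the number of times the preshutdown logic should ping each health check endpoint (roughly the ProxyGroup replica count, so that round-robin load balancing is likely to hit every backend at least once). The following is a hedged sketch of what producing that directory could look like; in the real setup this data is mounted into the Pod and managed by the operator, and the FQDN, endpoint URL, and replica count below are made up:

```go
package main

import (
	"encoding/json"
	"os"
	"path/filepath"
	"strconv"

	"tailscale.com/kube/egressservices"
)

// writeEgressProxyConfigDir is an illustrative stand-in for the logic that
// populates the proxy's mounted config directory.
func writeEgressProxyConfigDir(dir string, replicas int) error {
	cfgs := egressservices.Configs{
		"foo": {
			TailnetTarget:       egressservices.TailnetTarget{FQDN: "foo.tailnetxyz.ts.net"},
			HealthCheckEndpoint: "http://foo.operator-ns.svc.cluster.local:9002/healthz", // assumed URL shape
		},
	}
	cfgBytes, err := json.Marshal(cfgs)
	if err != nil {
		return err
	}
	if err := os.WriteFile(filepath.Join(dir, egressservices.KeyEgressServices), cfgBytes, 0o600); err != nil {
		return err
	}
	// Number of health check pings per endpoint during preshutdown, matching
	// how the containerboot test above writes a plain integer ("4") to this file.
	return os.WriteFile(filepath.Join(dir, egressservices.KeyHEPPings), []byte(strconv.Itoa(replicas)), 0o600)
}
```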
@@ -6,11 +6,18 @@
package main

import (
"context"
"fmt"
"io"
"net/http"
"net/netip"
"reflect"
"strings"
"sync"
"testing"

"tailscale.com/kube/egressservices"
"tailscale.com/kube/kubetypes"
)

func Test_updatesForSvc(t *testing.T) {
@@ -173,3 +180,145 @@ func Test_updatesForSvc(t *testing.T) {
})
}
}

// A failure of this test will most likely look like a timeout.
func TestWaitTillSafeToShutdown(t *testing.T) {
podIP := "10.0.0.1"
anotherIP := "10.0.0.2"

tests := []struct {
name string
// services is a map of service name to the number of calls to make to the healthcheck endpoint before
// returning a response that does NOT contain this Pod's IP in headers.
services map[string]int
replicas int
healthCheckSet bool
}{
{
name: "no_configs",
},
{
name: "one_service_immediately_safe_to_shutdown",
services: map[string]int{
"svc1": 0,
},
replicas: 2,
healthCheckSet: true,
},
{
name: "multiple_services_immediately_safe_to_shutdown",
services: map[string]int{
"svc1": 0,
"svc2": 0,
"svc3": 0,
},
replicas: 2,
healthCheckSet: true,
},
{
name: "multiple_services_no_healthcheck_endpoints",
services: map[string]int{
"svc1": 0,
"svc2": 0,
"svc3": 0,
},
replicas: 2,
},
{
name: "one_service_eventually_safe_to_shutdown",
services: map[string]int{
"svc1": 3, // After 3 calls to health check endpoint, no longer returns this Pod's IP
},
replicas: 2,
healthCheckSet: true,
},
{
name: "multiple_services_eventually_safe_to_shutdown",
services: map[string]int{
"svc1": 1, // After 1 call to health check endpoint, no longer returns this Pod's IP
"svc2": 3, // After 3 calls to health check endpoint, no longer returns this Pod's IP
"svc3": 5, // After 5 calls to the health check endpoint, no longer returns this Pod's IP
},
replicas: 2,
healthCheckSet: true,
},
{
name: "multiple_services_eventually_safe_to_shutdown_with_higher_replica_count",
services: map[string]int{
"svc1": 7,
"svc2": 10,
},
replicas: 5,
healthCheckSet: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
cfgs := &egressservices.Configs{}
switches := make(map[string]int)

for svc, callsToSwitch := range tt.services {
endpoint := fmt.Sprintf("http://%s.local", svc)
if tt.healthCheckSet {
(*cfgs)[svc] = egressservices.Config{
HealthCheckEndpoint: endpoint,
}
}
switches[endpoint] = callsToSwitch
}

ep := &egressProxy{
podIPv4: podIP,
client: &mockHTTPClient{
podIP: podIP,
anotherIP: anotherIP,
switches: switches,
},
}

ep.waitTillSafeToShutdown(context.Background(), cfgs, tt.replicas)
})
}
}

// mockHTTPClient is a client that receives an HTTP call for an egress service endpoint and returns a response with an
// IP address in a 'Pod-IPv4' header. It can be configured to return one IP address for N calls, then switch to another
// IP address to simulate a scenario where an IP is eventually no longer a backend for an endpoint.
// TODO(irbekrm): to test this more thoroughly, we should have the client take into account the number of replicas and
// return as if traffic was round robin load balanced across different Pods.
type mockHTTPClient struct {
// podIP - initial IP address to return, that matches the current proxy's IP address.
podIP string
anotherIP string
// after how many calls to an endpoint, the client should start returning 'anotherIP' instead of 'podIP.
switches map[string]int
mu sync.Mutex // protects the following
// calls tracks the number of calls received.
calls map[string]int
}

func (m *mockHTTPClient) Do(req *http.Request) (*http.Response, error) {
m.mu.Lock()
if m.calls == nil {
m.calls = make(map[string]int)
}

endpoint := req.URL.String()
m.calls[endpoint]++
calls := m.calls[endpoint]
m.mu.Unlock()

resp := &http.Response{
StatusCode: http.StatusOK,
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader("")),
}

if calls <= m.switches[endpoint] {
resp.Header.Set(kubetypes.PodIPv4Header, m.podIP) // Pod is still routable
} else {
resp.Header.Set(kubetypes.PodIPv4Header, m.anotherIP) // Pod is no longer routable
}
return resp, nil
}
@@ -64,16 +64,16 @@ type settings struct {
// when setting up rules to proxy cluster traffic to cluster ingress
// target.
// Deprecated: use PodIPv4, PodIPv6 instead to support dual stack clusters
PodIP string
PodIPv4 string
PodIPv6 string
PodUID string
HealthCheckAddrPort string
LocalAddrPort string
MetricsEnabled bool
HealthCheckEnabled bool
DebugAddrPort string
EgressSvcsCfgPath string
PodIP string
PodIPv4 string
PodIPv6 string
PodUID string
HealthCheckAddrPort string
LocalAddrPort string
MetricsEnabled bool
HealthCheckEnabled bool
DebugAddrPort string
EgressProxiesCfgPath string
}

func configFromEnv() (*settings, error) {
@@ -107,7 +107,7 @@ func configFromEnv() (*settings, error) {
MetricsEnabled: defaultBool("TS_ENABLE_METRICS", false),
HealthCheckEnabled: defaultBool("TS_ENABLE_HEALTH_CHECK", false),
DebugAddrPort: defaultEnv("TS_DEBUG_ADDR_PORT", ""),
EgressSvcsCfgPath: defaultEnv("TS_EGRESS_SERVICES_CONFIG_PATH", ""),
EgressProxiesCfgPath: defaultEnv("TS_EGRESS_PROXIES_CONFIG_PATH", ""),
PodUID: defaultEnv("POD_UID", ""),
}
podIPs, ok := os.LookupEnv("POD_IPS")
@@ -186,7 +186,7 @@ func (s *settings) validate() error {
return fmt.Errorf("error parsing TS_HEALTHCHECK_ADDR_PORT value %q: %w", s.HealthCheckAddrPort, err)
}
}
if s.localMetricsEnabled() || s.localHealthEnabled() {
if s.localMetricsEnabled() || s.localHealthEnabled() || s.EgressProxiesCfgPath != "" {
if _, err := netip.ParseAddrPort(s.LocalAddrPort); err != nil {
return fmt.Errorf("error parsing TS_LOCAL_ADDR_PORT value %q: %w", s.LocalAddrPort, err)
}
@@ -199,8 +199,8 @@ func (s *settings) validate() error {
if s.HealthCheckEnabled && s.HealthCheckAddrPort != "" {
return errors.New("TS_HEALTHCHECK_ADDR_PORT is deprecated and will be removed in 1.82.0, use TS_ENABLE_HEALTH_CHECK and optionally TS_LOCAL_ADDR_PORT")
}
if s.EgressSvcsCfgPath != "" && !(s.InKubernetes && s.KubeSecret != "") {
return errors.New("TS_EGRESS_SERVICES_CONFIG_PATH is only supported for Tailscale running on Kubernetes")
if s.EgressProxiesCfgPath != "" && !(s.InKubernetes && s.KubeSecret != "") {
return errors.New("TS_EGRESS_PROXIES_CONFIG_PATH is only supported for Tailscale running on Kubernetes")
}
return nil
}
@@ -291,7 +291,7 @@ func isOneStepConfig(cfg *settings) bool {
// as an L3 proxy, proxying to an endpoint provided via one of the config env
// vars.
func isL3Proxy(cfg *settings) bool {
return cfg.ProxyTargetIP != "" || cfg.ProxyTargetDNSName != "" || cfg.TailnetTargetIP != "" || cfg.TailnetTargetFQDN != "" || cfg.AllowProxyingClusterTrafficViaIngress || cfg.EgressSvcsCfgPath != ""
return cfg.ProxyTargetIP != "" || cfg.ProxyTargetDNSName != "" || cfg.TailnetTargetIP != "" || cfg.TailnetTargetFQDN != "" || cfg.AllowProxyingClusterTrafficViaIngress || cfg.EgressProxiesCfgPath != ""
}

// hasKubeStateStore returns true if the state must be stored in a Kubernetes
@@ -308,6 +308,10 @@ func (cfg *settings) localHealthEnabled() bool {
return cfg.LocalAddrPort != "" && cfg.HealthCheckEnabled
}

func (cfg *settings) egressSvcsTerminateEPEnabled() bool {
return cfg.LocalAddrPort != "" && cfg.EgressProxiesCfgPath != ""
}

// defaultEnv returns the value of the given envvar name, or defVal if
// unset.
func defaultEnv(name, defVal string) string {