cmd/{k8s-operator,containerboot},kube: ensure egress ProxyGroup proxies don't terminate while cluster traffic is still routed to them (#14436)

cmd/{containerboot,k8s-operator},kube: add preshutdown hook for egress PG proxies

This change is part of work towards minimizing downtime during update
rollouts of egress ProxyGroup replicas.
This change:
- updates the containerboot health check logic to return the Pod IP in response headers, if set
- always runs the health check for egress PG proxies
- updates ClusterIP Services created for PG egress endpoints to include
the health check endpoint
- implements a preshutdown endpoint in proxies. The preshutdown endpoint
logic waits until, for all currently configured egress services, the ClusterIP
Service health check endpoint is no longer served by the shutting-down Pod
(determined by looking at the new Pod IP header).
- ensures that kubelet is configured to call the preshutdown endpoint

This reduces the possibility that, as replicas are terminated during an update,
a replica gets terminated while cluster traffic is still being routed to it via
the ClusterIP Service because kube-proxy has not yet updated its routing rules.
This is not a perfect check: in practice it only verifies that the kube-proxy
on the node on which the proxy runs has updated its rules. Overall, however,
this should be good enough.
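
The wait performed by the new preshutdown endpoint can be sketched roughly as follows. This is a minimal illustration only: the package, helper and header names, the ping-count handling, and the polling interval are assumptions, not the actual containerboot implementation.

// Sketch of the preshutdown wait, not the actual containerboot code.
package egresswait

import (
	"context"
	"net/http"
	"time"
)

// podIPHeader is the (assumed) response header in which the health check
// endpoint reports the IP of the Pod that served the request.
const podIPHeader = "Pod-IPv4"

// waitUntilNotRouted polls the ClusterIP Service health check endpoints for
// all currently configured egress services and returns once none of them is
// answered by this Pod anymore, i.e. kube-proxy has stopped routing cluster
// traffic to the terminating replica.
func waitUntilNotRouted(ctx context.Context, podIP string, healthCheckURLs []string, pings int) error {
	client := &http.Client{Timeout: 5 * time.Second}
	for {
		routedToUs := false
		for _, url := range healthCheckURLs {
			// Ping each endpoint several times because a round-robin Service
			// will not hit every backend on a single request.
			for i := 0; i < pings; i++ {
				resp, err := client.Get(url)
				if err != nil {
					continue // unreachable; treat as not routed to us
				}
				resp.Body.Close()
				if resp.Header.Get(podIPHeader) == podIP {
					routedToUs = true
				}
			}
		}
		if !routedToUs {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second):
		}
	}
}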

The preshutdown logic is disabled if users have configured a custom health check
port via the TS_LOCAL_ADDR_PORT env var. This change logs a warning if so; in
future, setting that env var for operator proxies might be disallowed (as users
shouldn't need to configure this for a Pod directly).
This is backwards compatible with earlier proxy versions.

Updates tailscale/tailscale#14326


Signed-off-by: Irbe Krumina <irbe@tailscale.com>
Irbe Krumina authored on 2025-01-29 09:35:50 +02:00; committed by GitHub
parent eb299302ba
commit b406f209c3
18 changed files with 791 additions and 176 deletions


@@ -20,7 +20,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
tsoperator "tailscale.com/k8s-operator"
"tailscale.com/kube/egressservices"
"tailscale.com/types/ptr"
)
@@ -71,25 +70,27 @@ func (er *egressEpsReconciler) Reconcile(ctx context.Context, req reconcile.Requ
if err != nil {
return res, fmt.Errorf("error retrieving ExternalName Service: %w", err)
}
if !tsoperator.EgressServiceIsValidAndConfigured(svc) {
l.Infof("Cluster resources for ExternalName Service %s/%s are not yet configured", svc.Namespace, svc.Name)
return res, nil
}
// TODO(irbekrm): currently this reconcile loop runs all the checks every time it's triggered, which is
// wasteful. Once we have a Ready condition for ExternalName Services for ProxyGroup, use the condition to
// determine if a reconcile is needed.
oldEps := eps.DeepCopy()
proxyGroupName := eps.Labels[labelProxyGroup]
tailnetSvc := tailnetSvcName(svc)
l = l.With("tailnet-service-name", tailnetSvc)
// Retrieve the desired tailnet service configuration from the ConfigMap.
proxyGroupName := eps.Labels[labelProxyGroup]
_, cfgs, err := egressSvcsConfigs(ctx, er.Client, proxyGroupName, er.tsNamespace)
if err != nil {
return res, fmt.Errorf("error retrieving tailnet services configuration: %w", err)
}
if cfgs == nil {
// TODO(irbekrm): this path would be hit if an egress service was once exposed on a ProxyGroup that later
// got deleted. Probably the EndpointSlices then need to be deleted too - need to rethink this flow.
l.Debugf("No egress config found, likely because ProxyGroup has not been created")
return res, nil
}
cfg, ok := (*cfgs)[tailnetSvc]
if !ok {
l.Infof("[unexpected] configuration for tailnet service %s not found", tailnetSvc)


@@ -59,6 +59,8 @@ const (
maxPorts = 1000
indexEgressProxyGroup = ".metadata.annotations.egress-proxy-group"
tsHealthCheckPortName = "tailscale-health-check"
)
var gaugeEgressServices = clientmetric.NewGauge(kubetypes.MetricEgressServiceCount)
@@ -229,15 +231,16 @@ func (esr *egressSvcsReconciler) provision(ctx context.Context, proxyGroupName s
found := false
for _, wantsPM := range svc.Spec.Ports {
if wantsPM.Port == pm.Port && strings.EqualFold(string(wantsPM.Protocol), string(pm.Protocol)) {
// We don't use the port name to distinguish this port internally, but Kubernetes
// requires that, for Services with more than one port, each port is uniquely named.
// So we can always pick the port name from the ExternalName Service, as at this point we
// know that those are valid names because Kubernetes has already validated them. Note
// that users could have changed an unnamed port to a named port and might have changed
// port names - this should still work.
// We want to both preserve the user-set port names for ease of debugging and also
// ensure that we name all unnamed ports, as the ClusterIP Service that we create will
// always have at least two ports.
// https://kubernetes.io/docs/concepts/services-networking/service/#multi-port-services
// See also https://github.com/tailscale/tailscale/issues/13406#issuecomment-2507230388
clusterIPSvc.Spec.Ports[i].Name = wantsPM.Name
if wantsPM.Name != "" {
clusterIPSvc.Spec.Ports[i].Name = wantsPM.Name
} else {
clusterIPSvc.Spec.Ports[i].Name = "tailscale-unnamed"
}
found = true
break
}
@@ -252,6 +255,12 @@ func (esr *egressSvcsReconciler) provision(ctx context.Context, proxyGroupName s
// ClusterIP Service produce new target port and add a portmapping to
// the ClusterIP Service.
for _, wantsPM := range svc.Spec.Ports {
// Because we add a health check port of our own, we will always have at least two ports. That
// means that we cannot have ports with the name unset.
// https://kubernetes.io/docs/concepts/services-networking/service/#multi-port-services
if wantsPM.Name == "" {
wantsPM.Name = "tailscale-unnamed"
}
found := false
for _, gotPM := range clusterIPSvc.Spec.Ports {
if wantsPM.Port == gotPM.Port && strings.EqualFold(string(wantsPM.Protocol), string(gotPM.Protocol)) {
@@ -278,6 +287,25 @@ func (esr *egressSvcsReconciler) provision(ctx context.Context, proxyGroupName s
})
}
}
var healthCheckPort int32 = defaultLocalAddrPort
for {
if !slices.ContainsFunc(svc.Spec.Ports, func(p corev1.ServicePort) bool {
return p.Port == healthCheckPort
}) {
break
}
healthCheckPort++
if healthCheckPort > 10002 {
return nil, false, fmt.Errorf("unable to find a free port for internal health check in range [9002, 10002]")
}
}
clusterIPSvc.Spec.Ports = append(clusterIPSvc.Spec.Ports, corev1.ServicePort{
Name: tsHealthCheckPortName,
Port: healthCheckPort,
TargetPort: intstr.FromInt(defaultLocalAddrPort),
Protocol: "TCP",
})
if !reflect.DeepEqual(clusterIPSvc, oldClusterIPSvc) {
if clusterIPSvc, err = createOrUpdate(ctx, esr.Client, esr.tsNamespace, clusterIPSvc, func(svc *corev1.Service) {
svc.Labels = clusterIPSvc.Labels
@@ -320,7 +348,7 @@ func (esr *egressSvcsReconciler) provision(ctx context.Context, proxyGroupName s
}
tailnetSvc := tailnetSvcName(svc)
gotCfg := (*cfgs)[tailnetSvc]
wantsCfg := egressSvcCfg(svc, clusterIPSvc)
wantsCfg := egressSvcCfg(svc, clusterIPSvc, esr.tsNamespace, l)
if !reflect.DeepEqual(gotCfg, wantsCfg) {
l.Debugf("updating egress services ConfigMap %s", cm.Name)
mak.Set(cfgs, tailnetSvc, wantsCfg)
@@ -504,10 +532,8 @@ func (esr *egressSvcsReconciler) validateClusterResources(ctx context.Context, s
return false, nil
}
if !tsoperator.ProxyGroupIsReady(pg) {
l.Infof("ProxyGroup %s is not ready, waiting...", proxyGroupName)
tsoperator.SetServiceCondition(svc, tsapi.EgressSvcValid, metav1.ConditionUnknown, reasonProxyGroupNotReady, reasonProxyGroupNotReady, esr.clock, l)
tsoperator.RemoveServiceCondition(svc, tsapi.EgressSvcConfigured)
return false, nil
}
l.Debugf("egress service is valid")
@@ -515,6 +541,24 @@ func (esr *egressSvcsReconciler) validateClusterResources(ctx context.Context, s
return true, nil
}
func egressSvcCfg(externalNameSvc, clusterIPSvc *corev1.Service, ns string, l *zap.SugaredLogger) egressservices.Config {
d := retrieveClusterDomain(ns, l)
tt := tailnetTargetFromSvc(externalNameSvc)
hep := healthCheckForSvc(clusterIPSvc, d)
cfg := egressservices.Config{
TailnetTarget: tt,
HealthCheckEndpoint: hep,
}
for _, svcPort := range clusterIPSvc.Spec.Ports {
if svcPort.Name == tsHealthCheckPortName {
continue // exclude healthcheck from egress svcs configs
}
pm := portMap(svcPort)
mak.Set(&cfg.Ports, pm, struct{}{})
}
return cfg
}
func validateEgressService(svc *corev1.Service, pg *tsapi.ProxyGroup) []string {
violations := validateService(svc)
@@ -584,16 +628,6 @@ func tailnetTargetFromSvc(svc *corev1.Service) egressservices.TailnetTarget {
}
}
func egressSvcCfg(externalNameSvc, clusterIPSvc *corev1.Service) egressservices.Config {
tt := tailnetTargetFromSvc(externalNameSvc)
cfg := egressservices.Config{TailnetTarget: tt}
for _, svcPort := range clusterIPSvc.Spec.Ports {
pm := portMap(svcPort)
mak.Set(&cfg.Ports, pm, struct{}{})
}
return cfg
}
func portMap(p corev1.ServicePort) egressservices.PortMap {
// TODO (irbekrm): out of bounds check?
return egressservices.PortMap{Protocol: string(p.Protocol), MatchPort: uint16(p.TargetPort.IntVal), TargetPort: uint16(p.Port)}
@@ -618,7 +652,11 @@ func egressSvcsConfigs(ctx context.Context, cl client.Client, proxyGroupName, ts
Namespace: tsNamespace,
},
}
if err := cl.Get(ctx, client.ObjectKeyFromObject(cm), cm); err != nil {
err = cl.Get(ctx, client.ObjectKeyFromObject(cm), cm)
if apierrors.IsNotFound(err) { // ProxyGroup resources have not been created (yet)
return nil, nil, nil
}
if err != nil {
return nil, nil, fmt.Errorf("error retrieving egress services ConfigMap %s: %v", name, err)
}
cfgs = &egressservices.Configs{}
@@ -740,3 +778,17 @@ func (esr *egressSvcsReconciler) updateSvcSpec(ctx context.Context, svc *corev1.
svc.Status = *st
return err
}
// healthCheckForSvc returns the URL of containerboot's health check endpoint served by this Service, or an empty string.
func healthCheckForSvc(svc *corev1.Service, clusterDomain string) string {
// This version of the operator always sets the health check port on egress Services. However, it is possible
// that this reconcile loop runs during an upgrade from an operator version that did not set the health check port
// and parses a Service that does not have the port set yet.
i := slices.IndexFunc(svc.Spec.Ports, func(port corev1.ServicePort) bool {
return port.Name == tsHealthCheckPortName
})
if i == -1 {
return ""
}
return fmt.Sprintf("http://%s.%s.svc.%s:%d/healthz", svc.Name, svc.Namespace, clusterDomain, svc.Spec.Ports[i].Port)
}
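
For illustration, this is roughly the URL that healthCheckForSvc produces, shown here with example values (the Service name, namespace and cluster domain are hypothetical; 9002 is defaultLocalAddrPort):

// Sketch only: mirrors the fmt.Sprintf above with example values.
package sketch

import "fmt"

func exampleHealthCheckURL() string {
	svcName := "ts-egress-abc123"    // generated ClusterIP Service name (example)
	namespace := "tailscale"         // operator namespace (example)
	clusterDomain := "cluster.local" // default cluster domain
	port := 9002                     // defaultLocalAddrPort
	// => "http://ts-egress-abc123.tailscale.svc.cluster.local:9002/healthz"
	return fmt.Sprintf("http://%s.%s.svc.%s:%d/healthz", svcName, namespace, clusterDomain, port)
}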


@@ -18,6 +18,7 @@ import (
discoveryv1 "k8s.io/api/discovery/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
@@ -78,42 +79,16 @@ func TestTailscaleEgressServices(t *testing.T) {
Selector: nil,
Ports: []corev1.ServicePort{
{
Name: "http",
Protocol: "TCP",
Port: 80,
},
{
Name: "https",
Protocol: "TCP",
Port: 443,
},
},
},
}
t.Run("proxy_group_not_ready", func(t *testing.T) {
t.Run("service_one_unnamed_port", func(t *testing.T) {
mustCreate(t, fc, svc)
expectReconciled(t, esr, "default", "test")
// Service should have EgressSvcValid condition set to Unknown.
svc.Status.Conditions = []metav1.Condition{condition(tsapi.EgressSvcValid, metav1.ConditionUnknown, reasonProxyGroupNotReady, reasonProxyGroupNotReady, clock)}
expectEqual(t, fc, svc)
})
t.Run("proxy_group_ready", func(t *testing.T) {
mustUpdateStatus(t, fc, "", "foo", func(pg *tsapi.ProxyGroup) {
pg.Status.Conditions = []metav1.Condition{
condition(tsapi.ProxyGroupReady, metav1.ConditionTrue, "", "", clock),
}
})
expectReconciled(t, esr, "default", "test")
validateReadyService(t, fc, esr, svc, clock, zl, cm)
})
t.Run("service_retain_one_unnamed_port", func(t *testing.T) {
svc.Spec.Ports = []corev1.ServicePort{{Protocol: "TCP", Port: 80}}
mustUpdate(t, fc, "default", "test", func(s *corev1.Service) {
s.Spec.Ports = svc.Spec.Ports
})
expectReconciled(t, esr, "default", "test")
validateReadyService(t, fc, esr, svc, clock, zl, cm)
})
t.Run("service_add_two_named_ports", func(t *testing.T) {
@@ -164,7 +139,7 @@ func validateReadyService(t *testing.T, fc client.WithWatch, esr *egressSvcsReco
// Verify that an EndpointSlice has been created.
expectEqual(t, fc, endpointSlice(name, svc, clusterSvc))
// Verify that ConfigMap contains configuration for the new egress service.
mustHaveConfigForSvc(t, fc, svc, clusterSvc, cm)
mustHaveConfigForSvc(t, fc, svc, clusterSvc, cm, zl)
r := svcConfiguredReason(svc, true, zl.Sugar())
// Verify that the user-created ExternalName Service has Configured set to true and ExternalName pointing to the
// ClusterIP Service.
@@ -203,6 +178,23 @@ func findGenNameForEgressSvcResources(t *testing.T, client client.Client, svc *c
func clusterIPSvc(name string, extNSvc *corev1.Service) *corev1.Service {
labels := egressSvcChildResourceLabels(extNSvc)
ports := make([]corev1.ServicePort, len(extNSvc.Spec.Ports))
for i, port := range extNSvc.Spec.Ports {
ports[i] = corev1.ServicePort{ // Copy the port to avoid modifying the original.
Name: port.Name,
Port: port.Port,
Protocol: port.Protocol,
}
if port.Name == "" {
ports[i].Name = "tailscale-unnamed"
}
}
ports = append(ports, corev1.ServicePort{
Name: "tailscale-health-check",
Port: 9002,
TargetPort: intstr.FromInt(9002),
Protocol: "TCP",
})
return &corev1.Service{
ObjectMeta: metav1.ObjectMeta{
Name: name,
@@ -212,7 +204,7 @@ func clusterIPSvc(name string, extNSvc *corev1.Service) *corev1.Service {
},
Spec: corev1.ServiceSpec{
Type: corev1.ServiceTypeClusterIP,
Ports: extNSvc.Spec.Ports,
Ports: ports,
},
}
}
@@ -257,9 +249,9 @@ func portsForEndpointSlice(svc *corev1.Service) []discoveryv1.EndpointPort {
return ports
}
func mustHaveConfigForSvc(t *testing.T, cl client.Client, extNSvc, clusterIPSvc *corev1.Service, cm *corev1.ConfigMap) {
func mustHaveConfigForSvc(t *testing.T, cl client.Client, extNSvc, clusterIPSvc *corev1.Service, cm *corev1.ConfigMap, l *zap.Logger) {
t.Helper()
wantsCfg := egressSvcCfg(extNSvc, clusterIPSvc)
wantsCfg := egressSvcCfg(extNSvc, clusterIPSvc, clusterIPSvc.Namespace, l.Sugar())
if err := cl.Get(context.Background(), client.ObjectKeyFromObject(cm), cm); err != nil {
t.Fatalf("Error retrieving ConfigMap: %v", err)
}


@@ -777,7 +777,7 @@ func proxyClassHandlerForConnector(cl client.Client, logger *zap.SugaredLogger)
}
}
// proxyClassHandlerForConnector returns a handler that, for a given ProxyClass,
// proxyClassHandlerForProxyGroup returns a handler that, for a given ProxyClass,
// returns a list of reconcile requests for all Connectors that have
// .spec.proxyClass set.
func proxyClassHandlerForProxyGroup(cl client.Client, logger *zap.SugaredLogger) handler.MapFunc {
@@ -998,7 +998,7 @@ func reconcileRequestsForPG(pg string, cl client.Client, ns string) []reconcile.
// egressSvcsFromEgressProxyGroup is an event handler for egress ProxyGroups. It returns reconcile requests for all
// user-created ExternalName Services that should be exposed on this ProxyGroup.
func egressSvcsFromEgressProxyGroup(cl client.Client, logger *zap.SugaredLogger) handler.MapFunc {
return func(_ context.Context, o client.Object) []reconcile.Request {
return func(ctx context.Context, o client.Object) []reconcile.Request {
pg, ok := o.(*tsapi.ProxyGroup)
if !ok {
logger.Infof("[unexpected] ProxyGroup handler triggered for an object that is not a ProxyGroup")
@@ -1008,7 +1008,7 @@ func egressSvcsFromEgressProxyGroup(cl client.Client, logger *zap.SugaredLogger)
return nil
}
svcList := &corev1.ServiceList{}
if err := cl.List(context.Background(), svcList, client.MatchingFields{indexEgressProxyGroup: pg.Name}); err != nil {
if err := cl.List(ctx, svcList, client.MatchingFields{indexEgressProxyGroup: pg.Name}); err != nil {
logger.Infof("error listing Services: %v, skipping a reconcile for event on ProxyGroup %s", err, pg.Name)
return nil
}
@@ -1028,7 +1028,7 @@ func egressSvcsFromEgressProxyGroup(cl client.Client, logger *zap.SugaredLogger)
// epsFromExternalNameService is an event handler for ExternalName Services that define a Tailscale egress service that
// should be exposed on a ProxyGroup. It returns reconcile requests for EndpointSlices created for this Service.
func epsFromExternalNameService(cl client.Client, logger *zap.SugaredLogger, ns string) handler.MapFunc {
return func(_ context.Context, o client.Object) []reconcile.Request {
return func(ctx context.Context, o client.Object) []reconcile.Request {
svc, ok := o.(*corev1.Service)
if !ok {
logger.Infof("[unexpected] Service handler triggered for an object that is not a Service")
@@ -1038,7 +1038,7 @@ func epsFromExternalNameService(cl client.Client, logger *zap.SugaredLogger, ns
return nil
}
epsList := &discoveryv1.EndpointSliceList{}
if err := cl.List(context.Background(), epsList, client.InNamespace(ns),
if err := cl.List(ctx, epsList, client.InNamespace(ns),
client.MatchingLabels(egressSvcChildResourceLabels(svc))); err != nil {
logger.Infof("error listing EndpointSlices: %v, skipping a reconcile for event on Service %s", err, svc.Name)
return nil


@@ -32,6 +32,7 @@ import (
"tailscale.com/ipn"
tsoperator "tailscale.com/k8s-operator"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/kubetypes"
"tailscale.com/tailcfg"
"tailscale.com/tstime"
@@ -166,6 +167,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ
r.recorder.Eventf(pg, corev1.EventTypeWarning, reasonProxyGroupCreationFailed, err.Error())
return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreationFailed, err.Error())
}
validateProxyClassForPG(logger, pg, proxyClass)
if !tsoperator.ProxyClassIsReady(proxyClass) {
message := fmt.Sprintf("the ProxyGroup's ProxyClass %s is not yet in a ready state, waiting...", proxyClassName)
logger.Info(message)
@@ -204,6 +206,31 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ
return setStatusReady(pg, metav1.ConditionTrue, reasonProxyGroupReady, reasonProxyGroupReady)
}
// validateProxyClassForPG applies custom validation logic for ProxyClass applied to ProxyGroup.
func validateProxyClassForPG(logger *zap.SugaredLogger, pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass) {
if pg.Spec.Type == tsapi.ProxyGroupTypeIngress {
return
}
// Our custom logic for ensuring minimal-downtime ProxyGroup update rollouts relies on the local health check
// being accessible on the replica Pod IP:9002. This address can also be modified by users, via the
// TS_LOCAL_ADDR_PORT env var.
//
// Currently TS_LOCAL_ADDR_PORT controls the Pod's health check and metrics address. _Probably_ there is no need for
// users to set this to a custom value. Users who want to consume metrics should integrate with the metrics
// Service and/or ServiceMonitor, rather than with Pods directly. The health check is likely not useful to integrate
// with directly for operator proxies (and we should aim for unified lifecycle logic in the operator, so users
// shouldn't need to set their own).
//
// TODO(irbekrm): maybe disallow configuring this env var in future (in Tailscale 1.84 or later).
if hasLocalAddrPortSet(pc) {
msg := fmt.Sprintf("ProxyClass %s applied to an egress ProxyGroup has the TS_LOCAL_ADDR_PORT env var set to a custom value. "+
"This will disable the ProxyGroup graceful failover mechanism, so you might experience downtime when ProxyGroup pods are restarted. "+
"In future we will remove the ability to set a custom TS_LOCAL_ADDR_PORT for egress ProxyGroups. "+
"Please raise an issue if you expect that this will cause issues for your workflow.", pc.Name)
logger.Warn(msg)
}
}
func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) error {
logger := r.logger(pg.Name)
r.mu.Lock()
@@ -253,10 +280,11 @@ func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, pg *tsapi.Pro
return fmt.Errorf("error provisioning RoleBinding: %w", err)
}
if pg.Spec.Type == tsapi.ProxyGroupTypeEgress {
cm := pgEgressCM(pg, r.tsNamespace)
cm, hp := pgEgressCM(pg, r.tsNamespace)
if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, cm, func(existing *corev1.ConfigMap) {
existing.ObjectMeta.Labels = cm.ObjectMeta.Labels
existing.ObjectMeta.OwnerReferences = cm.ObjectMeta.OwnerReferences
mak.Set(&existing.BinaryData, egressservices.KeyHEPPings, hp)
}); err != nil {
return fmt.Errorf("error provisioning egress ConfigMap %q: %w", cm.Name, err)
}
@@ -270,7 +298,7 @@ func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, pg *tsapi.Pro
return fmt.Errorf("error provisioning ingress ConfigMap %q: %w", cm.Name, err)
}
}
ss, err := pgStatefulSet(pg, r.tsNamespace, r.proxyImage, r.tsFirewallMode)
ss, err := pgStatefulSet(pg, r.tsNamespace, r.proxyImage, r.tsFirewallMode, proxyClass)
if err != nil {
return fmt.Errorf("error generating StatefulSet spec: %w", err)
}


@@ -7,11 +7,14 @@ package main
import (
"fmt"
"slices"
"strconv"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"sigs.k8s.io/yaml"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/kube/egressservices"
@@ -19,9 +22,12 @@ import (
"tailscale.com/types/ptr"
)
// deletionGracePeriodSeconds is set to 6 minutes to ensure that the pre-stop hook of these proxies has enough time to complete gracefully.
const deletionGracePeriodSeconds int64 = 360
// Returns the base StatefulSet definition for a ProxyGroup. A ProxyClass may be
// applied over the top after.
func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode string) (*appsv1.StatefulSet, error) {
func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode string, proxyClass *tsapi.ProxyClass) (*appsv1.StatefulSet, error) {
ss := new(appsv1.StatefulSet)
if err := yaml.Unmarshal(proxyYaml, &ss); err != nil {
return nil, fmt.Errorf("failed to unmarshal proxy spec: %w", err)
@@ -145,15 +151,25 @@ func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode string
}
if pg.Spec.Type == tsapi.ProxyGroupTypeEgress {
envs = append(envs, corev1.EnvVar{
Name: "TS_EGRESS_SERVICES_CONFIG_PATH",
Value: fmt.Sprintf("/etc/proxies/%s", egressservices.KeyEgressServices),
},
envs = append(envs,
// TODO(irbekrm): in 1.80 we deprecated TS_EGRESS_SERVICES_CONFIG_PATH in favour of
// TS_EGRESS_PROXIES_CONFIG_PATH. Remove it in 1.84.
corev1.EnvVar{
Name: "TS_EGRESS_SERVICES_CONFIG_PATH",
Value: fmt.Sprintf("/etc/proxies/%s", egressservices.KeyEgressServices),
},
corev1.EnvVar{
Name: "TS_EGRESS_PROXIES_CONFIG_PATH",
Value: "/etc/proxies",
},
corev1.EnvVar{
Name: "TS_INTERNAL_APP",
Value: kubetypes.AppProxyGroupEgress,
},
)
corev1.EnvVar{
Name: "TS_ENABLE_HEALTH_CHECK",
Value: "true",
})
} else { // ingress
envs = append(envs, corev1.EnvVar{
Name: "TS_INTERNAL_APP",
@@ -167,6 +183,25 @@ func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode string
return append(c.Env, envs...)
}()
// The pre-stop hook is used to ensure that a replica does not get terminated while cluster traffic for egress
// services is still being routed to it.
//
// This mechanism currently (2025-01-26) relies on the local health check being accessible on the Pod's
// IP, so it is not supported for ProxyGroups where users have configured TS_LOCAL_ADDR_PORT to a custom
// value.
if pg.Spec.Type == tsapi.ProxyGroupTypeEgress && !hasLocalAddrPortSet(proxyClass) {
c.Lifecycle = &corev1.Lifecycle{
PreStop: &corev1.LifecycleHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: kubetypes.EgessServicesPreshutdownEP,
Port: intstr.FromInt(defaultLocalAddrPort),
},
},
}
// Set the deletion grace period to 6 minutes to ensure that the pre-stop hook has enough time to
// complete gracefully.
ss.Spec.Template.DeletionGracePeriodSeconds = ptr.To(deletionGracePeriodSeconds)
}
return ss, nil
}
@@ -258,7 +293,9 @@ func pgStateSecrets(pg *tsapi.ProxyGroup, namespace string) (secrets []*corev1.S
return secrets
}
func pgEgressCM(pg *tsapi.ProxyGroup, namespace string) *corev1.ConfigMap {
func pgEgressCM(pg *tsapi.ProxyGroup, namespace string) (*corev1.ConfigMap, []byte) {
hp := hepPings(pg)
hpBs := []byte(strconv.Itoa(hp))
return &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: pgEgressCMName(pg.Name),
@@ -266,8 +303,10 @@ func pgEgressCM(pg *tsapi.ProxyGroup, namespace string) *corev1.ConfigMap {
Labels: pgLabels(pg.Name, nil),
OwnerReferences: pgOwnerReference(pg),
},
}
BinaryData: map[string][]byte{egressservices.KeyHEPPings: hpBs},
}, hpBs
}
func pgIngressCM(pg *tsapi.ProxyGroup, namespace string) *corev1.ConfigMap {
return &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
@@ -313,3 +352,23 @@ func pgReplicas(pg *tsapi.ProxyGroup) int32 {
func pgEgressCMName(pg string) string {
return fmt.Sprintf("%s-egress-config", pg)
}
// hasLocalAddrPortSet returns true if the ProxyClass has the TS_LOCAL_ADDR_PORT env var set. For egress ProxyGroups,
// currently (2025-01-26) this means that the ProxyGroup does not support graceful failover.
func hasLocalAddrPortSet(proxyClass *tsapi.ProxyClass) bool {
if proxyClass == nil || proxyClass.Spec.StatefulSet == nil || proxyClass.Spec.StatefulSet.Pod == nil || proxyClass.Spec.StatefulSet.Pod.TailscaleContainer == nil {
return false
}
return slices.ContainsFunc(proxyClass.Spec.StatefulSet.Pod.TailscaleContainer.Env, func(env tsapi.Env) bool {
return env.Name == envVarTSLocalAddrPort
})
}
// hepPings returns the number of times a health check endpoint exposed by a Service fronting ProxyGroup replicas should
// be pinged to ensure that all currently configured backend replicas are hit.
func hepPings(pg *tsapi.ProxyGroup) int {
rc := pgReplicas(pg)
// Assuming a Service implemented using round-robin load balancing, pinging it number-of-replicas times should be enough, but in
// practice we cannot assume that the requests will be load balanced perfectly.
return int(rc) * 3
}
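
As a worked example: a ProxyGroup with 3 replicas yields 3 * 3 = 9 pings per health check endpoint, making it likely that every backend replica is hit at least once despite imperfect load balancing. The computed value is written into the egress ConfigMap under egressservices.KeyHEPPings, presumably for the proxies to read via the config mounted at TS_EGRESS_PROXIES_CONFIG_PATH.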


@@ -19,13 +19,13 @@ import (
rbacv1 "k8s.io/api/rbac/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
"tailscale.com/client/tailscale"
tsoperator "tailscale.com/k8s-operator"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/kubetypes"
"tailscale.com/tstest"
"tailscale.com/types/ptr"
@@ -97,7 +97,7 @@ func TestProxyGroup(t *testing.T) {
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "the ProxyGroup's ProxyClass default-pc is not yet in a ready state, waiting...", 0, cl, zl.Sugar())
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, false, "")
expectProxyGroupResources(t, fc, pg, false, "", pc)
})
t.Run("observe_ProxyGroupCreating_status_reason", func(t *testing.T) {
@@ -118,11 +118,11 @@ func TestProxyGroup(t *testing.T) {
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "0/2 ProxyGroup pods running", 0, cl, zl.Sugar())
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, true, "")
expectProxyGroupResources(t, fc, pg, true, "", pc)
if expected := 1; reconciler.egressProxyGroups.Len() != expected {
t.Fatalf("expected %d egress ProxyGroups, got %d", expected, reconciler.egressProxyGroups.Len())
}
expectProxyGroupResources(t, fc, pg, true, "")
expectProxyGroupResources(t, fc, pg, true, "", pc)
keyReq := tailscale.KeyCapabilities{
Devices: tailscale.KeyDeviceCapabilities{
Create: tailscale.KeyDeviceCreateCapabilities{
@@ -154,7 +154,7 @@ func TestProxyGroup(t *testing.T) {
}
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionTrue, reasonProxyGroupReady, reasonProxyGroupReady, 0, cl, zl.Sugar())
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash, pc)
})
t.Run("scale_up_to_3", func(t *testing.T) {
@@ -165,7 +165,7 @@ func TestProxyGroup(t *testing.T) {
expectReconciled(t, reconciler, "", pg.Name)
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "2/3 ProxyGroup pods running", 0, cl, zl.Sugar())
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash, pc)
addNodeIDToStateSecrets(t, fc, pg)
expectReconciled(t, reconciler, "", pg.Name)
@@ -175,7 +175,7 @@ func TestProxyGroup(t *testing.T) {
TailnetIPs: []string{"1.2.3.4", "::1"},
})
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash, pc)
})
t.Run("scale_down_to_1", func(t *testing.T) {
@@ -188,7 +188,7 @@ func TestProxyGroup(t *testing.T) {
pg.Status.Devices = pg.Status.Devices[:1] // truncate to only the first device.
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash, pc)
})
t.Run("trigger_config_change_and_observe_new_config_hash", func(t *testing.T) {
@@ -202,7 +202,7 @@ func TestProxyGroup(t *testing.T) {
expectReconciled(t, reconciler, "", pg.Name)
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, true, "518a86e9fae64f270f8e0ec2a2ea6ca06c10f725035d3d6caca132cd61e42a74")
expectProxyGroupResources(t, fc, pg, true, "518a86e9fae64f270f8e0ec2a2ea6ca06c10f725035d3d6caca132cd61e42a74", pc)
})
t.Run("enable_metrics", func(t *testing.T) {
@@ -246,12 +246,29 @@ func TestProxyGroup(t *testing.T) {
// The fake client does not clean up objects whose owner has been
// deleted, so we can't test for the owned resources getting deleted.
})
}
func TestProxyGroupTypes(t *testing.T) {
pc := &tsapi.ProxyClass{
ObjectMeta: metav1.ObjectMeta{
Name: "test",
Generation: 1,
},
Spec: tsapi.ProxyClassSpec{},
}
fc := fake.NewClientBuilder().
WithScheme(tsapi.GlobalScheme).
WithObjects(pc).
WithStatusSubresource(pc).
Build()
mustUpdateStatus(t, fc, "", pc.Name, func(p *tsapi.ProxyClass) {
p.Status.Conditions = []metav1.Condition{{
Type: string(tsapi.ProxyClassReady),
Status: metav1.ConditionTrue,
ObservedGeneration: 1,
}}
})
zl, _ := zap.NewDevelopment()
reconciler := &ProxyGroupReconciler{
@@ -274,9 +291,7 @@ func TestProxyGroupTypes(t *testing.T) {
Replicas: ptr.To[int32](0),
},
}
if err := fc.Create(context.Background(), pg); err != nil {
t.Fatal(err)
}
mustCreate(t, fc, pg)
expectReconciled(t, reconciler, "", pg.Name)
verifyProxyGroupCounts(t, reconciler, 0, 1)
@@ -286,7 +301,8 @@ func TestProxyGroupTypes(t *testing.T) {
t.Fatalf("failed to get StatefulSet: %v", err)
}
verifyEnvVar(t, sts, "TS_INTERNAL_APP", kubetypes.AppProxyGroupEgress)
verifyEnvVar(t, sts, "TS_EGRESS_SERVICES_CONFIG_PATH", fmt.Sprintf("/etc/proxies/%s", egressservices.KeyEgressServices))
verifyEnvVar(t, sts, "TS_EGRESS_PROXIES_CONFIG_PATH", "/etc/proxies")
verifyEnvVar(t, sts, "TS_ENABLE_HEALTH_CHECK", "true")
// Verify that egress configuration has been set up.
cm := &corev1.ConfigMap{}
@@ -323,6 +339,57 @@ func TestProxyGroupTypes(t *testing.T) {
if diff := cmp.Diff(expectedVolumeMounts, sts.Spec.Template.Spec.Containers[0].VolumeMounts); diff != "" {
t.Errorf("unexpected volume mounts (-want +got):\n%s", diff)
}
expectedLifecycle := corev1.Lifecycle{
PreStop: &corev1.LifecycleHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: kubetypes.EgessServicesPreshutdownEP,
Port: intstr.FromInt(defaultLocalAddrPort),
},
},
}
if diff := cmp.Diff(expectedLifecycle, *sts.Spec.Template.Spec.Containers[0].Lifecycle); diff != "" {
t.Errorf("unexpected lifecycle (-want +got):\n%s", diff)
}
if *sts.Spec.Template.DeletionGracePeriodSeconds != deletionGracePeriodSeconds {
t.Errorf("unexpected deletion grace period seconds %d, want %d", *sts.Spec.Template.DeletionGracePeriodSeconds, deletionGracePeriodSeconds)
}
})
t.Run("egress_type_no_lifecycle_hook_when_local_addr_port_set", func(t *testing.T) {
pg := &tsapi.ProxyGroup{
ObjectMeta: metav1.ObjectMeta{
Name: "test-egress-no-lifecycle",
UID: "test-egress-no-lifecycle-uid",
},
Spec: tsapi.ProxyGroupSpec{
Type: tsapi.ProxyGroupTypeEgress,
Replicas: ptr.To[int32](0),
ProxyClass: "test",
},
}
mustCreate(t, fc, pg)
mustUpdate(t, fc, "", pc.Name, func(p *tsapi.ProxyClass) {
p.Spec.StatefulSet = &tsapi.StatefulSet{
Pod: &tsapi.Pod{
TailscaleContainer: &tsapi.Container{
Env: []tsapi.Env{{
Name: "TS_LOCAL_ADDR_PORT",
Value: "127.0.0.1:8080",
}},
},
},
}
})
expectReconciled(t, reconciler, "", pg.Name)
sts := &appsv1.StatefulSet{}
if err := fc.Get(context.Background(), client.ObjectKey{Namespace: tsNamespace, Name: pg.Name}, sts); err != nil {
t.Fatalf("failed to get StatefulSet: %v", err)
}
if sts.Spec.Template.Spec.Containers[0].Lifecycle != nil {
t.Error("lifecycle hook was set when TS_LOCAL_ADDR_PORT was configured via ProxyClass")
}
})
t.Run("ingress_type", func(t *testing.T) {
@@ -341,7 +408,7 @@ func TestProxyGroupTypes(t *testing.T) {
}
expectReconciled(t, reconciler, "", pg.Name)
verifyProxyGroupCounts(t, reconciler, 1, 1)
verifyProxyGroupCounts(t, reconciler, 1, 2)
sts := &appsv1.StatefulSet{}
if err := fc.Get(context.Background(), client.ObjectKey{Namespace: tsNamespace, Name: pg.Name}, sts); err != nil {
@@ -402,13 +469,13 @@ func verifyEnvVar(t *testing.T, sts *appsv1.StatefulSet, name, expectedValue str
t.Errorf("%s environment variable not found", name)
}
func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyGroup, shouldExist bool, cfgHash string) {
func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyGroup, shouldExist bool, cfgHash string, proxyClass *tsapi.ProxyClass) {
t.Helper()
role := pgRole(pg, tsNamespace)
roleBinding := pgRoleBinding(pg, tsNamespace)
serviceAccount := pgServiceAccount(pg, tsNamespace)
statefulSet, err := pgStatefulSet(pg, tsNamespace, testProxyImage, "auto")
statefulSet, err := pgStatefulSet(pg, tsNamespace, testProxyImage, "auto", proxyClass)
if err != nil {
t.Fatal(err)
}


@@ -101,6 +101,9 @@ const (
proxyTypeIngressResource = "ingress_resource"
proxyTypeConnector = "connector"
proxyTypeProxyGroup = "proxygroup"
envVarTSLocalAddrPort = "TS_LOCAL_ADDR_PORT"
defaultLocalAddrPort = 9002 // metrics and health check port
)
var (