tailscale/cmd/k8s-operator/egress-pod-readiness.go
Irbe Krumina 3f39211f98
cmd/k8s-operator: check that cluster traffic is routed to egress ProxyGroup Pod before marking it as ready (#14792)
This change builds on top of #14436 to ensure minimal downtime during egress ProxyGroup update rollouts:

- adds a readiness gate for ProxyGroup replicas that prevents kubelet from marking
the replica Pod as ready before a corresponding readiness condition has been added
to the Pod

- adds a reconciler that reconciles egress ProxyGroup Pods and, for each Pod that is not yet
ready, checks whether cluster traffic for the relevant egress endpoints is routed via that
Pod; if so, it adds the readiness condition to allow kubelet to mark the Pod as ready.

During a sequenced StatefulSet update rollout, the next Pod is not restarted
before the previously updated replica has been marked as ready. Holding back
readiness therefore avoids a temporary post-update situation where all replicas
have been restarted, but none of the new ones is yet set up as an endpoint for
the egress service, and cluster traffic is dropped.
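
For illustration, the readiness gate is declared on the Pod spec roughly like this
(a sketch using corev1 types; the actual wiring lives in the ProxyGroup StatefulSet
template code):

    pod.Spec.ReadinessGates = append(pod.Spec.ReadinessGates, corev1.PodReadinessGate{
        ConditionType: "tailscale.com/egress-services",
    })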

Updates tailscale/tailscale#14326

Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2025-01-30 08:47:45 +00:00

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

//go:build !plan9

package main

import (
"context"
"errors"
"fmt"
"net/http"
"slices"
"strings"
"sync/atomic"
"time"
"go.uber.org/zap"
xslices "golang.org/x/exp/slices"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/kube/kubetypes"
"tailscale.com/logtail/backoff"
"tailscale.com/tstime"
"tailscale.com/util/httpm"
)

// tsEgressReadinessGate is the Pod condition type used both as a readiness gate on egress ProxyGroup Pods and as the
// condition that this reconciler sets once the Pod is able to route traffic.
const tsEgressReadinessGate = "tailscale.com/egress-services"

// egressPodsReconciler is responsible for setting the tailscale.com/egress-services condition on egress ProxyGroup
// Pods. The condition is used as a readiness gate for the Pod, meaning that kubelet will not mark the Pod as ready
// before the condition is set. The ProxyGroup StatefulSet updates are rolled out in such a way that no Pod is
// restarted before the previous Pod has been marked as ready, so ensuring that a Pod is not marked as ready until it
// can route traffic for the egress services prevents downtime during restarts: without the gate, every Pod could be
// recreated before any of the new ones has been added back to the Service endpoints, leaving no available endpoints.
// https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-readiness-gate
type egressPodsReconciler struct {
client.Client
logger *zap.SugaredLogger
tsNamespace string
clock tstime.Clock
httpClient doer // http client that can be set to a mock client in tests
maxBackoff time.Duration // max backoff period between health check calls
}
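
// For illustration, a reconciler like this is registered with controller-runtime
// (sigs.k8s.io/controller-runtime/pkg/builder) roughly as follows. This is a sketch, not the operator's actual
// wiring, which also needs to watch the EndpointSlices it manages:
//
//	err := builder.ControllerManagedBy(mgr).
//		Named("egress-pods-reconciler").
//		For(&corev1.Pod{}).
//		Complete(&egressPodsReconciler{
//			Client:      mgr.GetClient(),
//			logger:      logger,
//			tsNamespace: "tailscale",
//			clock:       tstime.DefaultClock{},
//			httpClient:  http.DefaultClient,
//			maxBackoff:  time.Second,
//		})
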
// Reconcile reconciles egress ProxyGroup Pods on changes to those Pods and to the ProxyGroup EndpointSlices. It
// ensures that each Pod that is ready to route traffic to all egress services for the ProxyGroup has the
// tailscale.com/egress-services condition set, so that kubelet will mark the Pod as ready.
//
// For the Pod to be ready to route traffic to an egress service, kube-proxy needs to have set up the Pod's IP as an
// endpoint for the ClusterIP Service corresponding to the egress service.
//
// Note that the endpoints for the ClusterIP Service are configured by the operator itself using custom EndpointSlices
// (egress-eps-reconciler), so the routing is not blocked on the Pod's readiness.
//
// Each egress service has a corresponding ClusterIP Service that exposes all user-configured tailnet ports, as well
// as a health check port for the proxy.
//
// The reconciler calls the health check endpoint of each Service up to N times, where N is the number of replicas
// for the ProxyGroup multiplied by 3, and checks whether a healthy response is received from the Pod being
// reconciled.
//
// The health check response contains a header with the Pod's IP address; this is used to determine whether the
// response was received from this Pod.
//
// If the Pod does not appear to be serving the health check endpoint (pre-v1.80 proxies), the reconciler just sets
// the readiness condition for backwards compatibility reasons.
func (er *egressPodsReconciler) Reconcile(ctx context.Context, req reconcile.Request) (res reconcile.Result, err error) {
l := er.logger.With("Pod", req.NamespacedName)
l.Debugf("starting reconcile")
defer l.Debugf("reconcile finished")
pod := new(corev1.Pod)
err = er.Get(ctx, req.NamespacedName, pod)
if apierrors.IsNotFound(err) {
return reconcile.Result{}, nil
}
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to get Pod: %w", err)
}
if !pod.DeletionTimestamp.IsZero() {
l.Debugf("Pod is being deleted, do nothing")
return res, nil
}
if pod.Labels[LabelParentType] != proxyTypeProxyGroup {
l.Infof("[unexpected] reconciler called for a Pod that is not a ProxyGroup Pod")
return res, nil
}
// If the Pod does not have the readiness gate set, there is no need to add the readiness condition. In practice
// this will happen if the user has configured custom TS_LOCAL_ADDR_PORT, thus disabling the graceful failover.
if !slices.ContainsFunc(pod.Spec.ReadinessGates, func(r corev1.PodReadinessGate) bool {
return r.ConditionType == tsEgressReadinessGate
}) {
l.Debug("Pod does not have egress readiness gate set, skipping")
return res, nil
}
proxyGroupName := pod.Labels[LabelParentName]
pg := new(tsapi.ProxyGroup)
if err := er.Get(ctx, types.NamespacedName{Name: proxyGroupName}, pg); err != nil {
return res, fmt.Errorf("error getting ProxyGroup %q: %w", proxyGroupName, err)
}
if pg.Spec.Type != typeEgress {
l.Infof("[unexpected] reconciler called for %q ProxyGroup Pod", pg.Spec.Type)
return res, nil
}
// Get all ClusterIP Services for all egress targets exposed to cluster via this ProxyGroup.
lbls := map[string]string{
LabelManaged: "true",
labelProxyGroup: proxyGroupName,
labelSvcType: typeEgress,
}
svcs := &corev1.ServiceList{}
if err := er.List(ctx, svcs, client.InNamespace(er.tsNamespace), client.MatchingLabels(lbls)); err != nil {
return res, fmt.Errorf("error listing ClusterIP Services")
}
idx := xslices.IndexFunc(pod.Status.Conditions, func(c corev1.PodCondition) bool {
return c.Type == tsEgressReadinessGate
})
if idx != -1 {
l.Debugf("Pod is already ready, do nothing")
return res, nil
}
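// Probe all Services concurrently. Every goroutine sends exactly one value on errChan, so the collection loop
// below always receives len(svcs.Items) values and terminates.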
var routesMissing atomic.Bool
errChan := make(chan error, len(svcs.Items))
for _, svc := range svcs.Items {
s := svc
go func() {
ll := l.With("service_name", s.Name)
d := retrieveClusterDomain(er.tsNamespace, ll)
healthCheckAddr := healthCheckForSvc(&s, d)
if healthCheckAddr == "" {
ll.Debugf("ClusterIP Service does not expose a health check endpoint, unable to verify if routing is set up")
errChan <- nil
return
}
var routesSetup bool
bo := backoff.NewBackoff(s.Name, ll.Infof, er.maxBackoff)
for range numCalls(pgReplicas(pg)) {
if ctx.Err() != nil {
errChan <- nil
return
}
state, err := er.lookupPodRouteViaSvc(ctx, pod, healthCheckAddr, ll)
if err != nil {
errChan <- fmt.Errorf("error validating if routing has been set up for Pod: %w", err)
return
}
if state == healthy || state == cannotVerify {
routesSetup = true
break
}
if state == unreachable || state == unhealthy || state == podNotReady {
bo.BackOff(ctx, errors.New("backoff"))
}
}
if !routesSetup {
ll.Debugf("Pod is not yet configured as Service endpoint")
routesMissing.Store(true)
}
errChan <- nil
}()
}
for range len(svcs.Items) {
e := <-errChan
err = errors.Join(err, e)
}
if err != nil {
return res, fmt.Errorf("error verifying conectivity: %w", err)
}
if rm := routesMissing.Load(); rm {
l.Info("Pod is not yet added as an endpoint for all egress targets, waiting...")
return reconcile.Result{RequeueAfter: shortRequeue}, nil
}
if err := er.setPodReady(ctx, pod, l); err != nil {
return res, fmt.Errorf("error setting Pod as ready: %w", err)
}
return res, nil
}

func (er *egressPodsReconciler) setPodReady(ctx context.Context, pod *corev1.Pod, l *zap.SugaredLogger) error {
if slices.ContainsFunc(pod.Status.Conditions, func(c corev1.PodCondition) bool {
return c.Type == tsEgressReadinessGate
}) {
return nil
}
l.Infof("Pod is ready to route traffic to all egress targets")
pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
Type: tsEgressReadinessGate,
Status: corev1.ConditionTrue,
LastTransitionTime: metav1.Time{Time: er.clock.Now()},
})
return er.Status().Update(ctx, pod)
}

// healthCheckState is the result of a single request to an egress Service health check endpoint, made with the goal
// of hitting a specific backend Pod.
type healthCheckState int8

const (
cannotVerify healthCheckState = iota // not verifiable for this setup (i.e. an earlier proxy version)
unreachable // no backends or another network error
notFound // hit another backend
unhealthy // response was not a 200
podNotReady // Pod is not ready, i.e. does not yet have an IP address
healthy // 200
)

// lookupPodRouteViaSvc attempts to reach a Pod using a health check endpoint served by a Service and returns the state of the health check.
func (er *egressPodsReconciler) lookupPodRouteViaSvc(ctx context.Context, pod *corev1.Pod, healthCheckAddr string, l *zap.SugaredLogger) (healthCheckState, error) {
if !slices.ContainsFunc(pod.Spec.Containers[0].Env, func(e corev1.EnvVar) bool {
return e.Name == "TS_ENABLE_HEALTH_CHECK" && e.Value == "true"
}) {
l.Debugf("Pod does not have health check enabled, unable to verify if it is currently routable via Service")
return cannotVerify, nil
}
wantsIP, err := podIPv4(pod)
if err != nil {
return -1, fmt.Errorf("error determining Pod's IP address: %w", err)
}
if wantsIP == "" {
return podNotReady, nil
}
ctx, cancel := context.WithTimeout(ctx, time.Second*3)
defer cancel()
req, err := http.NewRequestWithContext(ctx, httpm.GET, healthCheckAddr, nil)
if err != nil {
return -1, fmt.Errorf("error creating new HTTP request: %w", err)
}
// Do not reuse the same connection for the next request so as to maximize the chance of hitting all backends equally.
req.Close = true
resp, err := er.httpClient.Do(req)
if err != nil {
// This is most likely because this is the first Pod and it is not yet added to the Service endpoints. Other
// error types are possible, but checking for those would likely make the system too fragile.
return unreachable, nil
}
defer resp.Body.Close()
gotIP := resp.Header.Get(kubetypes.PodIPv4Header)
if gotIP == "" {
l.Debugf("Health check does not return Pod's IP header, unable to verify if Pod is currently routable via Service")
return cannotVerify, nil
}
if !strings.EqualFold(wantsIP, gotIP) {
return notFound, nil
}
if resp.StatusCode != http.StatusOK {
return unhealthy, nil
}
return healthy, nil
}

// numCalls returns the number of times an endpoint on a ProxyGroup Service should be called until it can safely be
// assumed that, if none of the responses came back from a specific Pod, traffic for the Service is currently not
// being routed to that Pod. This assumes that traffic for the Service is distributed round robin, so
// InternalTrafficPolicy must be 'Cluster' and session affinity must be None.
func numCalls(replicas int32) int32 {
return replicas * 3
}
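
// For example, with 2 replicas the reconciler makes up to numCalls(2) == 6 health check calls per Service: with ideal
// round-robin distribution 2 calls would already hit each backend once, and the 3x multiplier adds headroom for the
// distribution being only approximately even in practice.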

// doer is an interface for an HTTP client that can be swapped for a mock client in tests.
type doer interface {
Do(*http.Request) (*http.Response, error)
}
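
// For illustration, tests can satisfy doer with a stub like the following (a sketch with a hypothetical fakeDoer; the
// operator's real tests construct their own mocks):
//
//	type fakeDoer struct{ ip string }
//
//	func (f fakeDoer) Do(*http.Request) (*http.Response, error) {
//		h := http.Header{}
//		h.Set(kubetypes.PodIPv4Header, f.ip)
//		return &http.Response{StatusCode: http.StatusOK, Header: h, Body: http.NoBody}, nil
//	}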